-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patha_text_search.R
executable file
·67 lines (65 loc) · 2.16 KB
/
a_text_search.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#' Words bibliography subset
#'
#' Subset bibliography data frame using words critiria. Algorithm uses
#' "stringdist" package function "amatch" to approximately match words.
#'
#' @param WoSDT data.frame with bibliography records
#' @param searchVector vector of strings in which search will be performed.
#' Usually this vector represents data.frame column coerced to vector which
#' contains bibliography fields in which to search.
#' @param searchString string that contains words to search.
#'
#' @return
#' @export
#'
#' @examples
#' searchVector<-WoSDT$title
#' searchWords<-"value risk"
#' filterBibliographyByWords(WoSDT, searchVector, searchWords)
filterBibliographyByWords<-function(WoSDT, searchVector, searchString){
searchVector %<>% str_to_lower
searchString %<>% str_to_lower
searchWords<-words(searchString)
filter<-searchVector %>%
lapply(words) %>%
lapply(function(vtitle) {
amatch(searchWords, vtitle, maxDist = 2)
}) %>%
lapply(function(x){
ifelse(sum(!is.na(x))==length(x), T, F)
}) %>% unlist
WoSDT %>%
subset(filter)
}
# feed<-feed.extract("http://rss.sciencedirect.com/publication/science/10575219", encoding = integer())
# read_xml(feed %>% unlist)
# class(feed)
# txt<-feed %>% unlist
# df<-data.frame(title=txt, stringsAsFactors = F) %>% rownames_to_column(var="tag") %>%
# subset(title!="updated") %>%
# mutate(tag=str_replace_all(tag, pattern = "items.", replacement = "")) %>%
# subset(!is.na(str_match(tag, "title")))
#
# df$title
#ReadCrossRef(df$title[1], list(issn = "10575219", "from-pub-date" = 2018), limit = 3)
# df %>% head
# str_view(txt, pattern = " ")
# class(txt)
# names(txt)
# length(txt)
# txt[1:10]
# read_html(feed %>% unlist)
# feed[[2]]
# ?feed.extract
# parse.rdf(rss)
# parse.rdf(feed)
# dois<-dfWoS[1:2,]$doi
# url1<-"https://doi.org/"
# urls<-paste(url1, dois, sep="")
# GetBibEntryWithDOI(doi=c("10.1111/jofi.12188"),
# temp.file="~/wos_data/new_bib.bib",
# delete.file = F)
#
# GetBibEntryWithDOI(c("10.1016/j.iheduc.2003.11.004", "10.3998/3336451.0004.203"),
# temp.file = file.path(getwd(), "bibs.bib"), delete.file = F)
#