-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathedges_in_webpages.R
73 lines (68 loc) · 2.32 KB
/
edges_in_webpages.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#find connections between items in QS ecosystem
#by searching through web pages
#load file
#reload all data or just find unparsed links?
#
#for both in and out
#for each item with links to do
# separate string into char vec of links
# load char vec of pages
# for each line get name and syns
# for each page
# add line's detected to vector
# save vector to cell
#
#save file
#
#### load file get links ####
# library() stops with an error if readr is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(readr)

# Read the semicolon-separated connections table and convert it to a plain
# data.frame so the columns can be indexed and assigned with base R below.
Cs <- read_delim("Connections.csv",
                 delim = ";", escape_double = FALSE, trim_ws = TRUE)
Cs <- as.data.frame(Cs)

# Set to TRUE to re-scan every row that has links, including rows whose
# Automatic_* cell already holds a result. Rows whose Automatic_* cell is
# "-" are excluded in both modes.
search_again <- FALSE

# `%in%` never returns NA, unlike `!=`. The previous masks combined
# `is.na(x)` with `x != "-"`, which evaluates to NA (never TRUE) for empty
# cells, so which() selected no rows at all.
skip_in <- Cs$Automatic_In %in% "-"
skip_out <- Cs$Automatic_Out %in% "-"

# lix / lox: logical masks of the rows whose in-links / out-links are scanned.
if (search_again) {
  lix <- !is.na(Cs$Link_In) & !skip_in
  lox <- !is.na(Cs$Link_Out) & !skip_out
} else {
  # Default: only rows that have links but no automatic result yet.
  lix <- !is.na(Cs$Link_In) & is.na(Cs$Automatic_In)
  lox <- !is.na(Cs$Link_Out) & is.na(Cs$Automatic_Out)
}
#alsyn<-paste(Cs$Synonyms,sep=",",collapse = ",")
#alsyn<-paste(paste(Cs$Name,collapse=","),alsyn,sep=",")
#alsyn<-unique(alsyn)
#### for each in link####
library(rvest)
library(stringr)

# Download one page and return its text content as a single string.
# Returns NA_character_ (with a warning) when the page cannot be read, so
# that one dead link does not abort the whole run before the save step.
fetch_page_text <- function(url) {
  tryCatch(
    html_text(read_html(url)),
    error = function(e) {
      warning("Could not read ", url, ": ", conditionMessage(e),
              call. = FALSE)
      NA_character_
    }
  )
}

# Return the unique entries of `names` whose name, or any of whose
# comma-separated `synonyms`, occurs in at least one element of `page_text`.
#
# page_text: character vector, one element per downloaded page.
# names, synonyms: parallel character vectors (one entry per table row).
find_mentions <- function(page_text, names, synonyms) {
  is_mentioned <- vapply(seq_along(names), function(i) {
    terms <- c(names[i], str_split(synonyms[i], ",")[[1]])
    terms <- str_trim(terms[!is.na(terms)])
    # Blank terms (e.g. from a trailing comma) are not valid search patterns.
    terms <- terms[terms != ""]
    for (term in terms) {
      # fixed(): match the term literally; names may contain characters
      # such as "." or "+" that have a special meaning in a regex.
      if (any(str_detect(page_text, fixed(term)))) {
        return(TRUE)
      }
    }
    FALSE
  }, logical(1))
  found <- unique(names[is_mentioned])
  found[!is.na(found)]
}

# For every row index in `rows`, download the pages listed (comma-separated)
# in column `link_col`, look for all item names/synonyms of `Cs` in them and
# store the comma-joined names that were found in column `result_col`.
# Returns the updated data.frame.
scan_links <- function(Cs, rows, link_col, result_col) {
  for (l in rows) {
    urls <- str_trim(str_split(Cs[[link_col]][l], ",")[[1]])
    urls <- urls[urls != ""]
    page_text <- vapply(urls, fetch_page_text, character(1),
                        USE.NAMES = FALSE)
    if (length(page_text) == 0 || anyNA(page_text)) {
      # Leave the cell untouched when a page is missing, so no partial
      # result is written for this row.
      next
    }
    page_text <- str_conv(page_text, encoding = "UTF-8")
    found <- find_mentions(page_text, Cs$Name, Cs$Synonyms)
    Cs[[result_col]][l] <- paste(found, collapse = ",")
  }
  Cs
}

#### for each out link ####
# The in-link scan runs first, then the out-link scan, with the same logic.
Cs <- scan_links(Cs, which(lix), "Link_In", "Automatic_In")
Cs <- scan_links(Cs, which(lox), "Link_Out", "Automatic_Out")
#### save ####
# Write the table back to Connections.csv (the same path it was read from):
# semicolon-separated, unquoted, NA written as an empty field, and without
# a row-name column.
write.table(
  Cs,
  file = "Connections.csv",
  sep = ";",
  na = "",
  quote = FALSE,
  row.names = FALSE
)