-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocessing.R
132 lines (129 loc) · 4.76 KB
/
preprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#' Append cited-reference rows to a bibliographic data frame
#'
#' Splits the `CR` field of every record into individual references, keeps
#' the references classified as papers, derives `AU`, `PY`, `JI` and `SR_ref`
#' from the reference text, and appends the resulting rows to `data`.
#'
#' A reference is kept only when all of the following hold:
#' * it contains a four-digit year in parentheses, e.g. `(2015)`;
#' * it is not classified as type 2 (contains `"., ("`; books), type 3
#'   (starts with the parenthesised year) or type 4 (the parenthesised year
#'   is preceded by a space and followed by `", "`);
#' * the text between the year and the next `", "` is not empty.
#'
#' @param data A data frame with at least the columns `SR` and `CR`. `CR` is
#'   split on `"; "`, so it is expected to hold one or more references
#'   separated that way (presumably a Scopus export -- confirm with callers).
#' @return `data` with one extra row per retained reference. In those rows
#'   `SR` is the `SR` of the citing record, `CR` is the single reference
#'   string, `AU`/`PY`/`JI`/`SR_ref` are parsed from it, and the remaining
#'   bibliographic columns are `NA`.
preprocessing <- function(data) {
  year_pattern <- "\\([0-9]{4}\\)"
  up_to_year <- paste0(".*", year_pattern)

  # One row per cited reference. Records with a missing SR or CR are dropped
  # before splitting; references without a "(YYYY)" year are dropped after.
  cited <- data |>
    select(SR, CR) |>
    na.omit() |>
    separate_rows(CR, sep = "; ") |>
    mutate(
      PY = str_extract(CR, year_pattern),
      PY = str_remove_all(PY, "[\\(\\)]"),
      PY = as.numeric(PY)
    ) |>
    na.omit()

  cited <- cited |>
    mutate(
      # Author block: everything before the year, cut at the last ".,".
      AU = str_extract(CR, up_to_year),
      AU = str_extract(AU, ".*\\.,"),
      # Every second comma closes a "Surname, Initials," pair; turn it into
      # the ";" author separator.
      AU = gsub("([^,]+,[^,]+),", "\\1;", AU),
      # Drop the trailing separator left by the step above.
      AU = str_sub(AU, 1, nchar(AU) - 1),
      AU = str_replace_all(AU, "; ", ";"),
      AU = str_remove_all(AU, "[.,]"),
      # Reference type; the first matching pattern wins.
      type_ref = case_when(
        str_detect(CR, "\\., \\(") ~ 2, # books
        str_detect(CR, paste0("^", year_pattern)) ~ 3,
        str_detect(CR, paste0(" ", year_pattern, ", ")) ~ 4,
        TRUE ~ 1 # papers
      )
    )

  # Only papers are kept, so the source name is parsed for those rows only.
  # NOTE(review): no title (TI) is extracted because TI is not among the
  # columns emitted for reference rows.
  papers <- cited |>
    filter(type_ref == 1) |>
    mutate(
      # Text after the year, up to the first ", " (presumably the journal).
      JI = str_remove(CR, up_to_year),
      JI = str_remove(JI, ", .*"),
      JI = str_trim(JI)
    ) |>
    filter(JI != "") |>
    mutate(
      JI = str_remove_all(JI, "\\."),
      # Short reference: "<first author>, <year>, <source>".
      SR_ref = gsub("^(.*?);.*", "\\1", AU),
      SR_ref = str_c(SR_ref, ", ", PY, ", ", JI)
    )

  # Add the bibliographic columns that cannot be derived from a reference
  # string as NA placeholders, then fix the column order.
  reference_rows <- papers |>
    add_column(
      DE = NA, ID = NA, C1 = NA, AB = NA, PA = NA, AR = NA,
      chemicals_cas = NA, coden = NA, RP = NA, DT = NA, DI = NA, BE = NA,
      FU = NA, BN = NA, SN = NA, SO = NA, LA = NA, TC = NA, PN = NA,
      page_count = NA, PP = NA, PU = NA, PM = NA, DB = NA, sponsors = NA,
      url = NA, VL = NA, FX = NA, AU_UN = NA, AU1_UN = NA, AU_UN_NR = NA,
      SR_FULL = NA
    ) |>
    select(
      AU, DE, ID, C1, CR, JI, AB, PA, AR, chemicals_cas,
      coden, RP, DT, DI, BE, FU, BN, SN, SO,
      LA, TC, PN, page_count, PP, PU, PM, DB, sponsors, url,
      VL, PY, FX, AU_UN, AU1_UN, AU_UN_NR, SR_FULL, SR, SR_ref
    )

  bind_rows(data, reference_rows)
}