-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path.Rhistory
512 lines (512 loc) · 27.1 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
"red_loci_HCF" #reduce loci with only high confidence father assignments included
)
#read in these files as CSVs
par_scen_df <- list()
for(df in 1:length(par_scen_df_list)){
par_scen_df[[df]] <- read.csv(paste0("Data_Files/CSV_Files/", par_scen_df_list[[df]]))
}
print(par_scen_df)
#plot bar graphs of the offspring count for each mother and which individual is the father
png(paste0("Results/Figures/", full_scen[[1]], "_CF_permom.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[1]] %>%
ggplot() +
geom_bar(aes(y = sort(Candidate_father_ID))) +
facet_wrap(~`Mother_ID`) +
scale_x_continuous(n.breaks = 9) +
labs(title = "Count of Offspring per Candidate Father and Maternal Tree Pairs",
y = "Candidate Father ID", x = "Count of Offspring")
dev.off()
png(paste0("Results/Figures/", full_scen[[2]], "_CF_permom.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[2]] %>%
ggplot() +
geom_bar(aes(y = sort(Candidate_father_ID))) +
facet_wrap(~`Mother_ID`) +
scale_x_continuous(n.breaks = 9) +
labs(title = "Count of Offspring per Candidate Father and Maternal Tree Pairs",
y = "Candidate Father ID", x = "Count of Offspring")
dev.off()
png(paste0("Results/Figures/", full_scen[[3]], "_CF_permom.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[3]] %>%
ggplot() +
geom_bar(aes(y = sort(Candidate_father_ID))) +
facet_wrap(~`Mother_ID`) +
scale_x_continuous(n.breaks = 9) +
labs(title = "Count of Offspring per Candidate Father and Maternal Tree Pairs",
y = "Candidate Father ID", x = "Count of Offspring")
dev.off()
png(paste0("Results/Figures/", full_scen[[4]], "_CF_permom.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[4]] %>%
ggplot() +
geom_bar(aes(y = sort(Candidate_father_ID))) +
facet_wrap(~`Mother_ID`) +
scale_x_continuous(n.breaks = 9) +
labs(title = "Count of Offspring per Candidate Father and Maternal Tree Pairs",
y = "Candidate Father ID", x = "Count of Offspring")
dev.off()
###### Boxplot of distances between parents
png(paste0("Results/Figures/", full_scen[[1]], "_dist_par.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[1]] %>%
group_by(c(Mother_ID)) %>% #
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)), y = dist_par)) +
expand_limits(y = c(0, 650)) + # set limits for graph
#theme_minimal() + # set theme
theme_bw() + # set theme
geom_boxplot(fill="darkolivegreen4") +
geom_jitter(color = "grey", fill = "black", width = 0.3) +
geom_text(data = . %>% count(Mother_ID), aes(label = n, y = 645), vjust = -0.5) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)")
dev.off()
png(paste0("Results/Figures/", full_scen[[2]], "_dist_par.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[2]] %>%
group_by(c(Mother_ID)) %>% #
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)), y = dist_par)) + #This is making me think that I actually grouped it by count of occurrences lowest to highest instead of highest average distance between parents. As we discussed with Sean, the order is not that important.
expand_limits(y = c(0, 650)) + # set limits for graph
#theme_minimal() + # set theme
theme_bw() + # set theme
geom_boxplot(fill="darkolivegreen4") +
geom_jitter(color = "grey", fill = "black", width = 0.3) +
geom_text(data = . %>% count(Mother_ID), aes(label = n, y = 645), vjust = -0.5) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)")
dev.off()
png(paste0("Results/Figures/", full_scen[[3]], "_dist_par.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[3]] %>%
group_by(c(Mother_ID)) %>% #
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)), y = dist_par)) + #This is making me think that I actually grouped it by count of occurrences lowest to highest instead of highest average distance between parents. As we discussed with Sean, the order is not that important.
expand_limits(y = c(0, 650)) + # set limits for graph
#theme_minimal() + # set theme
theme_bw() + # set theme
geom_boxplot(fill="darkolivegreen4") +
geom_jitter(color = "grey", fill = "black", width = 0.3) +
geom_text(data = . %>% count(Mother_ID), aes(label = n, y = 645), vjust = -0.5) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)")
dev.off()
png(paste0("Results/Figures/", full_scen[[4]], "_dist_par.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[4]] %>%
group_by(c(Mother_ID)) %>% #
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)), y = dist_par)) + #This is making me think that I actually grouped it by count of occurrences lowest to highest instead of highest average distance between parents. As we discussed with Sean, the order is not that important.
expand_limits(y = c(0, 650)) + # set limits for graph
#theme_minimal() + # set theme
theme_bw() + # set theme
geom_boxplot(fill="darkolivegreen4") +
geom_jitter(color = "grey", fill = "black", width = 0.3) +
geom_text(data = . %>% count(Mother_ID), aes(label = n, y = 645), vjust = -0.5) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)")
dev.off()
for(scen in 1:length(full_scen)){
par_scen_df[[scen]] <- par_scen_df[[scen]] %>%
mutate(`Parents Are` = case_when(Half_Sibs == FALSE ~ "Not Half Siblings",
TRUE ~ "Half Siblings",
NA ~ "No Accession Number")
)
}
#graph of half-sibling matings group by maternal ID
png(paste0("Results/Figures/", full_scen[[1]], "_half_sib_dist.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[1]] %>%
group_by(`Parents Are`) %>% # group by half siblings to compare the status
ggplot(aes(x = Mother_ID, y = dist_par, color = `Parents Are`)) +
geom_jitter(width = 0.2) +
expand_limits(y = c(0, 650)) + # set limits for graph
scale_color_manual(values = c("cadetblue", "navy")) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[2]], "_half_sib_dist.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[2]] %>%
group_by(`Parents Are`) %>% # group by half siblings to compare the status
ggplot(aes(x = Mother_ID, y = dist_par, color = `Parents Are`)) +
geom_jitter(width = 0.2) +
expand_limits(y = c(0, 650)) + # set limits for graph
scale_color_manual(values = c("cadetblue", "navy")) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[3]], "_half_sib_dist.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[3]] %>%
group_by(`Parents Are`) %>% # group by half siblings to compare the status
ggplot(aes(x = Mother_ID, y = dist_par, color = `Parents Are`)) +
geom_jitter(width = 0.2) +
expand_limits(y = c(0, 650)) + # set limits for graph
scale_color_manual(values = c("cadetblue", "navy")) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[4]], "_half_sib_dist.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[4]] %>%
group_by(`Parents Are`) %>% # group by half siblings to compare the status
ggplot(aes(x = Mother_ID, y = dist_par, color = `Parents Are`)) +
geom_jitter(width = 0.2) +
expand_limits(y = c(0, 650)) + # set limits for graph
scale_color_manual(values = c("cadetblue", "navy")) +
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
#loop to add hybrid status to data frame
for(scen in 1:length(full_scen)){
par_scen_df[[scen]] <- par_scen_df[[scen]] %>%
mutate(`Offspring Hybrid Status` = case_when(Hybrid_Status == FALSE ~ "Not Hybrid",
TRUE ~ "Hybrid"))
}
#####barplots of the percentage of hybrids
png(paste0("Results/Figures/", full_scen[[1]], "_hybrid_per.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[1]] %>%
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)),
y = dist_par,
fill = `Offspring Hybrid Status`)) +
geom_boxplot() +
scale_fill_manual(values = c("darkseagreen", "darkgreen")) +
expand_limits(y = c(0, 650)) + # set limits for graph
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[2]], "_hybrid_per.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[2]] %>%
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)),
y = dist_par,
fill = `Offspring Hybrid Status`)) +
geom_boxplot() +
scale_fill_manual(values = c("darkseagreen", "darkgreen")) +
expand_limits(y = c(0, 650)) + # set limits for graph
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[3]], "_hybrid_per.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[3]] %>%
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)),
y = dist_par,
fill = `Offspring Hybrid Status`)) +
geom_boxplot() +
scale_fill_manual(values = c("darkseagreen", "darkgreen")) +
expand_limits(y = c(0, 650)) + # set limits for graph
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[4]], "_hybrid_per.png"),
res = 600, width = 5000, height = 3500)
par_scen_df[[4]] %>%
ggplot(aes(x = fct_rev(fct_infreq(Mother_ID)),
y = dist_par,
fill = `Offspring Hybrid Status`)) +
geom_boxplot() +
scale_fill_manual(values = c("darkseagreen", "darkgreen")) +
expand_limits(y = c(0, 650)) + # set limits for graph
xlab("Maternal Tree ID") + ylab("Distance between parents (m)") +
theme_bw()
dev.off()
##create a list for
#table to present the candidate fathers
species_count_list <- list()
#loop to create species count lists for parent assignments
for(scen in 1:length(full_scen)){
#create a data frame
species_count_list[[scen]] <- as.data.frame(table(par_scen_df[[scen]]$Candidate_Father_Species))
#rename columns
names(species_count_list[[scen]]) <- c("Species", "Count")
#organize data frame
species_count_list[[scen]]$Species <- factor(species_count_list[[scen]]$Species,
levels=species_count_list[[scen]]$Species[order(-species_count_list[[scen]]$Count)])
}
#now plot species count table
png(paste0("Results/Figures/", full_scen[[1]], "_species_count.png"),
res = 600, width = 5000, height = 3500)
species_count_list[[1]] %>%
ggplot(aes(x = Species, y=Count))+
geom_bar(stat = "identity", fill = "darkgreen") +
labs(title="Count of Candidate Father Trees per Species",
x="Candidate Father Species") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[2]], "_species_count.png"),
res = 600, width = 5000, height = 3500)
species_count_list[[2]] %>%
ggplot(aes(x = Species, y=Count))+
geom_bar(stat = "identity", fill = "darkgreen") +
labs(title="Count of Candidate Father Trees per Species",
x="Candidate Father Species") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[3]], "_species_count.png"),
res = 600, width = 5000, height = 3500)
species_count_list[[3]] %>%
ggplot(aes(x = Species, y=Count))+
geom_bar(stat = "identity", fill = "darkgreen") +
labs(title="Count of Candidate Father Trees per Species",
x="Candidate Father Species") +
theme_bw()
dev.off()
png(paste0("Results/Figures/", full_scen[[4]], "_species_count.png"),
res = 600, width = 5000, height = 3500)
species_count_list[[4]] %>%
ggplot(aes(x = Species, y=Count))+
geom_bar(stat = "identity", fill = "darkgreen") +
labs(title="Count of Candidate Father Trees per Species",
x="Candidate Father Species") +
theme_bw()
dev.off()
library(tidyverse)
library(geosphere)
#set working directory
setwd("/Users/mikaelyevans/Documents/GitHub/USBGHybridAcornsREU2023")
#load in the tissue database, remove offspring which have no coordinates
UHA_db <- read.csv("Data_Files/CSV_Files/ARCHIVED_USBG_Hybrid_Acorn_Tissue_Database.csv")
#read in cleaned data file
UHA_score_clean_df <- read.csv("Data_Files/CSV_Files/UHA_score_clean_df.csv")
#load in parentage results
parentage_results <- read.csv("Data_Files/CSV_Files/UHA_full_parentage.csv")
###################################
# Reorganize data frames #
###################################
#remove individuals with too much missing data
UHA_clean_df <- UHA_db[UHA_db$Tissue_ID %in% UHA_score_clean_df$Tissue_ID,]
#remove offspring from the data frame
UHA_par_df <- UHA_clean_df %>%
filter(!is.na(Longitude) | !is.na(Latitude))
#update parentage analysis df
UHA_clean_par_df <- parentage_results %>%
rename(Father_ID = Candidate_father_ID) %>%
filter(!is.na(Father_ID))
###Create data frames with distances
#Create a column of the crosses of every parent individual
all_potential_combo <- crossing(UHA_par_df$Tissue_ID,
UHA_par_df$Tissue_ID)
colnames(all_potential_combo) <- c("Parent_1", "Parent_2")
#remove rows where the parents are the same individual (selfing)
potential_combo_dedup <- filter(all_potential_combo, Parent_1 != Parent_2)
#make columns to store distance between parents and parent species
potential_combo_dedup$dist <- NA
potential_combo_dedup$Parent_1_species <- NA
potential_combo_dedup$Parent_2_species <- NA
#loop to calculate distance between parents
for(d in 1:nrow(potential_combo_dedup)){
Parent_1 <- potential_combo_dedup$Parent_1[d]
Parent_2 <- potential_combo_dedup$Parent_2[d]
#access the original tissue database via the parents
Parent_1_Database_row <- filter(UHA_clean_df, Tissue_ID == Parent_1)
Parent_2_Database_row <- filter(UHA_clean_df, Tissue_ID == Parent_2)
#use distGeo to get distance in m between the lat long points of the 2 parents
potential_combo_dedup$dist[d] <- distGeo(c(Parent_1_Database_row$Longitude, Parent_1_Database_row$Latitude),
c(Parent_2_Database_row$Longitude, Parent_2_Database_row$Latitude))
#record the species of both parents
potential_combo_dedup$Parent_1_species[d] <- Parent_1_Database_row$Species
potential_combo_dedup$Parent_2_species[d] <- Parent_2_Database_row$Species
}
#reorganize the combination data frame to fix species naming conventions and to
#have a hybrid column
potential_combo_info <- potential_combo_dedup %>%
mutate(Parent_1_species = str_replace_all(Parent_1_species, "Quercus", "Q."),
Parent_2_species = str_replace_all(Parent_2_species, "Quercus", "Q.")) %>% #replace all instances of Quercus with Q. to ensure all species names are formatted the same
mutate(Parental_species_match = case_when(Parent_1_species == Parent_2_species ~ "Conspecific",
Parent_1_species != Parent_2_species ~ "Heterospecific")) #if the species of the two parents matches then they are conspecific, if not they are heterospecific
#list of maternal names
mom_IDs <- unique(parentage_results$Mother_ID)
#filter combintation df by the possible mothers
relevant_potential_combos <- potential_combo_info %>%
filter(Parent_1 %in% mom_IDs)
#replaces relevant_parentage_results
#combine all columns for distance analysis
par_results_df <- left_join(UHA_clean_par_df,
select(relevant_potential_combos,
c(Parent_1, Parent_2, dist)),
join_by(Mother_ID == Parent_1,
Father_ID == Parent_2))
############### Create data frames
# right now just taking the mean dist of successful dads vs possible dads
#create df that has the mean distance of the 5 closest real fathers to each
#maternal tree (without ties, only possible with slice_min, using
#that instead of top_n)
rf_mean_small_df <- par_results_df %>%
group_by(Mother_ID) %>%
slice_min(dist, n =5, with_ties = FALSE) %>%
summarise(Mean_smallest_real_dists = mean(dist,
na.rm=TRUE))
#same as above but doing 5 farthest dists
rf_mean_large_df <- par_results_df %>%
group_by(Mother_ID) %>%
slice_max(dist, n =5, with_ties = FALSE) %>%
summarise(Mean_largest_real_dists = mean(dist, na.rm=TRUE))
#create df that has the mean distance of the 5 closest possible
#fathers to each maternal tree (including those with less than 5 offspring)
#Note that I don't need to call distinct here because there is only
#one entry per mother/father combo
pf_mean_small_df <- relevant_potential_combos %>%
group_by(Parent_1, Parental_species_match) %>%
slice_min(dist, n =5, with_ties = FALSE) %>%
summarise(Mean_smallest_potential_dists = mean(dist,
na.rm=TRUE))
#same as abovebut with 5 farthest dists
pf_mean_large_df <- relevant_potential_combos %>%
group_by(Parent_1, Parental_species_match) %>%
slice_max(dist, n =5, with_ties = FALSE) %>%
summarise(Mean_largest_potential_dists = mean(dist,
na.rm=TRUE))
#summarize data by mean and min distance to real fathers and by proportion of
#offspring that are hybrids (with heterospecific fathers)
rf_df <- par_results_df %>%
group_by(Mother_ID) %>%
summarise(Mean_real_dist = mean(dist, na.rm=TRUE),
Min_real_dist = min(dist, na.rm=TRUE),
Max_real_dist = max(dist, na.rm=TRUE),
Prop_hybrids = mean(Hybrid, na.rm = TRUE)) %>%
left_join(., rf_mean_small_df, join_by(Mother_ID == Mother_ID)) %>% #add the Mean_smallest_dists data
left_join(., rf_mean_large_df, join_by(Mother_ID == Mother_ID)) #add the Mean_largest_dists data
# Make dataset ("potential_fathers_summary") containing all of the desired
#summarized categories from the possible combinations of mothers and fathers
#data (mean distance of potential fathers, minimum distance of potential fathers,
#mean distance of closest 5 potential fathers).
#NOTE: this is NOT based off of the offspring data meaning that the 5
#closest fathers will be unique and the mean distance to fathers will be
#unweighted (because we have no way to know potential fathers relative success
#(i.e. how many offspring they may produce)) summarize data from data
#with all combos of possible conspecific and heterospefic individuals with
#each real mom
pf_df <- relevant_potential_combos %>%
group_by(Parent_1, Parental_species_match) %>%
summarise(Mean_potential_dist = mean(dist, na.rm=TRUE),
Min_potential_dist = min(dist, na.rm=TRUE),
Max_potential_dist = max(dist, na.rm=TRUE)) %>%
left_join(., pf_mean_small_df, join_by(Parent_1 == Parent_1, Parental_species_match == Parental_species_match)) %>% #add the Mean_smallest_dists data
left_join(., pf_mean_large_df, join_by(Parent_1 == Parent_1, Parental_species_match == Parental_species_match)) %>% #add the Mean_largest_dists data
left_join(., select(rf_df, c(Mother_ID, Prop_hybrids)), join_by(Parent_1 == Mother_ID)) #add the proportion of hybrids had by each mother (from real_fathers_summary)
###create a data frame to save the adjusted R-squared and p-value
rf_hybrid_prop_dist_df <- matrix(nrow = 5, ncol = 2)
#save p-values and r-squared results
rf_hybrid_prop_dist_df[1,1] <- summary(lm(formula = Prop_hybrids~Mean_real_dist, data=rf_df))$adj.r.squared
rf_hybrid_prop_dist_df[1,2] <- summary(lm(formula = Prop_hybrids~Mean_real_dist, data=rf_df))$coefficients[2,4]
rf_hybrid_prop_dist_df[2,1] <- summary(lm(formula = Prop_hybrids~Min_real_dist, data=rf_df))$adj.r.squared
rf_hybrid_prop_dist_df[2,2] <- summary(lm(formula = Prop_hybrids~Min_real_dist, data=rf_df))$coefficients[2,4]
rf_hybrid_prop_dist_df[3,1] <- summary(lm(formula = Prop_hybrids~Max_real_dist, data=rf_df))$adj.r.squared
rf_hybrid_prop_dist_df[3,2] <- summary(lm(formula = Prop_hybrids~Max_real_dist, data=rf_df))$coefficients[2,4]
rf_hybrid_prop_dist_df[4,1] <- summary(lm(formula = Prop_hybrids~Mean_smallest_real_dists, data=rf_df))$adj.r.squared
rf_hybrid_prop_dist_df[4,2] <- summary(lm(formula = Prop_hybrids~Mean_smallest_real_dists, data=rf_df))$coefficients[2,4]
rf_hybrid_prop_dist_df[5,1] <- summary(lm(formula = Prop_hybrids~Mean_largest_real_dists, data=rf_df))$adj.r.squared
rf_hybrid_prop_dist_df[5,2] <- summary(lm(formula = Prop_hybrids~Mean_largest_real_dists, data=rf_df))$coefficients[2,4]
#add columns and row labels
rownames(rf_hybrid_prop_dist_df) <- c("Mean_Par_Dist", "Min_Par_Dist", "Max_Par_Dist",
"Mean_Smallest_Dist", "Mean_Largest_Dist")
colnames(rf_hybrid_prop_dist_df) <- c("R2", "p-value")
#write out plot comparing real fathers plot
png("Results/Pairwise_Distance_Analysis/rf_dist_par_prop.png", width = 900,
height = 600)
rf_df %>%
ggplot() +
geom_point(aes(y = Prop_hybrids, x = Mean_real_dist)) +
geom_smooth(aes(y = Prop_hybrids, x = Mean_real_dist), method='lm', formula= y~x) +
xlab("Mean Distance Between Parents (m)") + ylab("Proportion of Hybrid Offspring") +
theme_classic()
dev.off()
# The linear model for the mean distance of the shortest 5 distances of real fathers
#to a given mother shows no relationship between the distance of the closest
#5 realized fathers to a given mother and the proportion of offspring that were
#hybrids for that mother.
rf_df %>%
ggplot() +
geom_point(aes(y = Prop_hybrids, x = Mean_smallest_real_dists)) +
geom_smooth(aes(y = Prop_hybrids, x = Mean_smallest_real_dists),
method='lm', formula= y~x) +
xlab("Mean Distance Between Parents (m) - Shortest Distances") +
ylab("Proportion of Hybrid Offspring") +
theme_classic()
# The linear model for the mean distance of the largest 5 distances of real fathers
#to a given mother
rf_df %>%
#filter(Mean_largest_real_dists < 500) %>% #to remove the largest value individual if you want
ggplot() +
geom_point(aes(y = Prop_hybrids, x = Mean_largest_real_dists)) +
geom_smooth(aes(y = Prop_hybrids, x = Mean_largest_real_dists),
method='lm', formula= y~x) +
xlab("Mean Distance Between Parents (m) - Largest Distances") +
ylab("Proportion of Hybrid Offspring") +
theme_classic()
# note that when you remove the mom with the largest values (> 500),
#still significant
summary(lm(formula = Prop_hybrids~Mean_largest_real_dists,
data=filter(rf_df, Mean_largest_real_dists < 500)))
write.csv(rf_hybrid_prop_dist_df, "Results/Pairwise_Distance_Analysis/rf_hybrid_prop_dist.csv")
#####Potential fathers to a given mother df
###create a data frame to save the adjusted R-squared and p-value
pf_hybrid_prop_dist <- matrix(nrow = 6, ncol = 2)
# The linear model for the mean distance of a potential fathers to a given mother split by conspecific and heterospecific fathers. There is no relationship between the proportion of hybrid offspring and the mean distances of all potential het or con fathers.
pf_hybrid_prop_dist[1,1] <- summary(lm(formula = Prop_hybrids~Mean_potential_dist,
data=filter(pf_df,
Parental_species_match == "Conspecific")))$adj.r.squared
pf_hybrid_prop_dist[1,2] <- summary(lm(formula = Prop_hybrids~Mean_potential_dist,
data=filter(pf_df,
Parental_species_match == "Conspecific")))$coefficients[2,4]
#mean distance to potential fathers, conspecific
pf_hybrid_prop_dist[2,1] <- summary(lm(formula = Prop_hybrids~Mean_potential_dist, data=filter(pf_df, Parental_species_match == "Heterospecific")))$adj.r.squared
pf_hybrid_prop_dist[2,2] <- summary(lm(formula = Prop_hybrids~Mean_potential_dist, data=filter(pf_df, Parental_species_match == "Heterospecific")))$coefficients[2,4]
#The linear model for the smallest distance of a potential father to a given
#mother split by conspecific and heterospecific fathers.
#There is no relationship between the proportion of hybrid offspring and the
#minimum distances of all potential het or con fathers.
pf_hybrid_prop_dist[3,1] <- summary(lm(formula = Prop_hybrids~Min_potential_dist, data=filter(pf_df, Parental_species_match == "Conspecific")))$adj.r.squared
pf_hybrid_prop_dist[3,2]<- summary(lm(formula = Prop_hybrids~Min_potential_dist, data=filter(pf_df, Parental_species_match == "Conspecific")))$coefficients[2,4]
pf_hybrid_prop_dist[4,1] <- summary(lm(formula = Prop_hybrids~Min_potential_dist, data=filter(pf_df, Parental_species_match == "Heterospecific")))$adj.r.squared
pf_hybrid_prop_dist[4,2]<- summary(lm(formula = Prop_hybrids~Min_potential_dist, data=filter(pf_df, Parental_species_match == "Heterospecific")))$coefficients[2,4]
# The linear model for the mean distance of the shortest 5 distances of potential fathers to a given mother split by conspecific and heterospecific fathers. There is no relationship between the proportion of hybrid offspring and the 5 smallest distances of all potential het or con fathers.
pf_hybrid_prop_dist[5,1] <- summary(lm(formula = Prop_hybrids~Mean_smallest_potential_dists, data=filter(pf_df, Parental_species_match == "Conspecific")))$adj.r.squared
pf_hybrid_prop_dist[5,2] <- summary(lm(formula = Prop_hybrids~Mean_smallest_potential_dists, data=filter(pf_df, Parental_species_match == "Conspecific")))$coefficients[2,4]
pf_hybrid_prop_dist[6,1] <- summary(lm(formula = Prop_hybrids~Mean_smallest_potential_dists, data=filter(pf_df, Parental_species_match == "Heterospecific")))$adj.r.squared
pf_hybrid_prop_dist[6,2] <- summary(lm(formula = Prop_hybrids~Mean_smallest_potential_dists, data=filter(pf_df, Parental_species_match == "Heterospecific")))$coefficients[2,4]
#add rownames and colnames
rownames(pf_hybrid_prop_dist) <- c("Mean_Dist_Con", "Mean_Dist_Het", "Min_Dist_Con",
"Min_Dist_Het", "Max_Dist_Con", "Max_Dist_Het")
colnames(pf_hybrid_prop_dist) <- c("R2", "p-value")
write.csv(pf_hybrid_prop_dist, "Results/Pairwise_Distance_Analysis/pf_hybrid_prop_dist.csv")
set.seed(2024) #since we are using the sample function, set the seed for repeatability
#create the table that will hold the results of the for loop with 3 columns: Mom, Dad, and dist
parents_dists_table_full <- tibble(Mom = character(), Dad = character(), dist = character())
for(i in 1:length(unique(relevant_potential_combos$Parent_1))){
parents_dists_table_small <- tibble(Mom = character(), Dad = character(), dist = character()) #make a small version of the final table that will be overwritten with each new mom
mom <- unique(relevant_potential_combos$Parent_1)[i] #get the ID of the given mom
mom_specific_combos <- relevant_potential_combos %>%
filter(Parent_1 == mom) %>% #filter all possible combos to only those with the given mom as Parent_1
filter(Parental_species_match == "Conspecific") #filter all possible combos with the given mom to only those that are the same species as the given mom
possible_dads <- mom_specific_combos$Parent_2 #make a vector with a list of all of the possible dads by pulling the Parent_2 column of the mom_specific_combos df
dads <- sample(possible_dads, size=1000, replace=T) #randomly draw 1000 samples from all possible dads with replacement
#a for loop that will loop through each of the 1000 sampled dads will make and add a row for the parents_dists_table_small
for(x in 1:length(dads)){
dad <- dads[x] #get the ID of the given dad
parent_combo_row_for_table <- tibble(Mom = NA, Dad = NA, dist = NA) #creating table that will be a single row and will be overwritten during each iteration of the loop
parent_combo_row_for_table$Mom <- mom #add ID of the given mom to table
parent_combo_row_for_table$Dad <- dad #add ID of the given dad to table
parent_combo_row_for_table$dist <- filter(mom_specific_combos, Parent_1 == mom & Parent_2 == dad)$dist #add dist from mom to dad in table (by matching the mom and dad's IDs to the mom_specific_combos df)
parents_dists_table_small <- rbind(parents_dists_table_small, parent_combo_row_for_table) #bind the row of the given dad to the table of the given mom
}
parents_dists_table_full <- rbind(parents_dists_table_full, parents_dists_table_small) #bind the table of the given given to the table of all the moms
}
parentage_results_for_hists <- par_results_df %>%
rename(Mom = Mother_ID,
Dad = Father_ID) %>%
select(Mom, Dad, dist, Hybrid) %>%
filter(Hybrid == F) %>% #we are only looking at distance between conspecific pairs in this analysis (no hydrbid offspring)
filter(!is.na(dist)) #remove entries where dist is NA because some trees that we have tissue from do not have long/lat values
dist_hist <- ggplot() +
geom_histogram(data = parents_dists_table_full,
aes(x = dist, y = ..density.., fill = "theoretical"),
alpha = .5) +
geom_histogram(data = parentage_results_for_hists,
aes(x = dist, y = ..density.., fill = "real"),
alpha = .5) +
labs(fill = "data set") +
scale_fill_manual(values = c("theoretical" = "red", "real" = "blue")) +
#facet_wrap(~Mom) + # facet_wrap by Mom to see how each individual Maternal tree did compared to bootstrap
theme_classic()
##write out histogram
png("Results/Pairwise_Distance_Analysis/sim_real_dist_hist.png", width = 1000, height = 850)
dist_hist
dev.off()