MarineGEO_data_scripts.Rmd

---
title: "MarineGEO Data Formatting, Summarizing and joining"
author: "Eric D. Crandall"
date: "5/29/2017"
output: 
  html_notebook:
    toc: yes
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
	echo = TRUE,
	message = FALSE,
	warning = FALSE
)
library(plyr)
library(dplyr)
library(ggmap)
library(readxl)
library(lubridate)
library(stringr)
filedir<-"~/google_drive/MarineGEO"

bbox<-c(left=-157.85,bottom= 21.36,right=-157.75, top=21.55)
dmap <- get_map(location = bbox, maptype = "satellite", source = "google")

# to do:

# split scientific name into genus and species where appropriate
# sp. changes lowest known to genus
# organismRemarks == "Bishop" - change institutionID to "Bishop Museum"
# organismScope = bulk lot if individualCount > 1
# add on location info specific to Kaneohe Bay

```
# Build Events Database

## Fish
```{r Fish Events}

fish_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Events")

### Format events to schema

#change all empty columns from logical to character class
fish_events[sapply(fish_events, is.logical)] <- lapply(fish_events[sapply(fish_events, is.logical)],  as.character)

#change eventID to just the KANF value
fish_events$eventID<-str_extract(fish_events$eventID,"KANF\\d\\d\\d")

#round the lat/longs to four digits (~ 10m uncertainty)
fish_events$decimalLatitude<-round(as.numeric(fish_events$decimalLatitude),digits = 4)
fish_events$decimalLongitude<-round(as.numeric(fish_events$decimalLongitude),digits = 4)

fish_events$coordinateUncertaintyInMeters<-as.numeric(fish_events$coordinateUncertaintyInMeters) 

# class(fish_events$geoReferenceProtocol)
# class(fish_events$maximumDepthInMeters)
# class(fish_events$minimumDepthInMeters)
# class(fish_events$recordedBy) 
# class(fish_events$samplingProtocol) #some missing. ask Diane to fill this in
# class(fish_events$habitatBiotic)

#remove carriage returns from Geomorphological Zone
fish_events$habitatGeomorphologicalZone<-str_replace_all(fish_events$habitatGeomorphologicalZone,pattern="\r\n",replacement = "")
fish_events$habitatSubstrate<-str_replace_all(fish_events$habitatSubstrate,pattern="\r\n",replacement = "")

fish_events$year<-as.numeric(fish_events$year)
fish_events$month<-match(fish_events$month,month.abb)
fish_events$day<-as.numeric(fish_events$day)

#Recommended fields
# class(fish_events$eventRemarks)
# class(fish_events$locality)
# class(fish_events$eventRemarks)
# class(fish_events$verbatimCoordinates)
# class(fish_events$eventMedia)

#remove ending times from eventTime field
fish_events$eventTime<-str_extract(string=fish_events$eventTime,pattern="^\\d\\d:\\d\\d")

```

## Algae

```{r Algae Events}

algae_events<-read_excel("/Users/eric/google_drive/MarineGEO/algae/MarineGEOHI_bioassessment_master_KANA.xlsx", sheet="Station")

### Format events to schema

#change all empty columns from logical to character class
algae_events[sapply(algae_events, is.logical)] <- lapply(algae_events[sapply(algae_events, is.logical)],  as.character)


#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative

algae_events$decimalLatitude<-round(as.numeric(algae_events$decimalLatitude),digits = 4)
algae_events$decimalLongitude<-abs(round(as.numeric(algae_events$decimalLongitude),digits = 4))*-1

# class(algae_events$eventID)
# class(algae_events$geoReferenceProtocol) #good
# class(algae_events$coordinateUncertaintyInMeters) #good
# class(algae_events$maximumDepthInMeters) #some missing. ask Melinda to fill this in
# class(algae_events$minimumDepthInMeters)#some missing. ask Melinda to fill this in
# class(algae_events$recordedBy) #some missing. ask Melinda to fill this in
# class(algae_events$samplingProtocol) #some missing. ask Melinda to fill this in
# class(algae_events$`habitatBiotic`)
# class(algae_events$`habitatGeomorphologicalZone`)
# class(algae_events$`habitatSubstrate`)


#Recommended fields
# class(algae_events$eventRemarks)
# class(algae_events$locality)
# class(algae_events$eventRemarks)
# class(algae_events$verbatimCoordinates)
# class(algae_events$eventMedia)

#format time correctly
algae_events$eventTime<-as.character(parse_date_time(algae_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")


```

## Macroinvertebrates

```{r Invert Events}

invert_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Inverts/BKANE_063017_FIMS.xlsx", sheet = "Station", skip=2)


### Format events to schema
# Note: invert event data need a lot of cleaning, including importing stations from the algae and meiofauna team, and making new stations with "A" or "B" appended. Currently 839 samples do not have station information because of this.

# remove event fields that don't appear in the schema
invert_events<-invert_events[,-grep(pattern = "^X",x = names(invert_events),perl=T)]

#change all empty columns from logical to character class
invert_events[sapply(invert_events, is.logical)] <- lapply(invert_events[sapply(invert_events, is.logical)],  as.character)

#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
invert_events$decimalLatitude<-round(as.numeric(invert_events$decimalLatitude),digits = 4)
invert_events$decimalLongitude<-abs(round(as.numeric(invert_events$decimalLongitude),digits = 4))*-1

invert_events$geoReferenceProtocol<-"GPS"
invert_events$coordinateUncertaintyInMeters<-100

# class(invert_events$maximumDepthInMeters) #some missing. ask John to fill this in
# class(invert_events$minimumDepthInMeters)#some missing. ask John to fill this in

#Format the date properly, get rid of the "raw" field
invert_events$day<-day(invert_events$eventDate)
invert_events$month<-month(invert_events$eventDate)
invert_events$year<-year(invert_events$eventDate)
invert_events$eventDate<-NULL

# class(invert_events$recordedBy) #good
# class(invert_events$samplingProtocol) #some missing. ask John to fill this in
# class(invert_events$habitatGeomorphologicalZone) #some missing. ask John to fill this in. #this needs to be aligned with the schema
# class(invert_events$habitatSubstrate) # some missing. ask John to fill this in. #this needs to be aligned with the schema


#Recommended fields
# class(invert_events$locality)
# class(invert_events$eventRemarks)

```

## Meiofauna

```{r Meio Events}

meio_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Meiofauna/Hawaii2017_meiofauna_MarineGEO.xlsx", sheet = "station data", skip=1)

#remove columns that don't appear in MarineGEO schema
meio_events<-meio_events[,-grep(pattern = "^X",x = names(meio_events),perl=T)]

### Format events to schema

#Missing station KANM087!!!

#change all empty columns from logical to character class
meio_events[sapply(meio_events, is.logical)] <- lapply(meio_events[sapply(meio_events, is.logical)],  as.character)

#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
meio_events$decimalLatitude<-round(as.numeric(meio_events$decimalLatitude),digits = 4)
meio_events$decimalLongitude<-abs(round(as.numeric(meio_events$decimalLongitude),digits = 4))*-1

meio_events$coordinateUncertaintyInMeters<-100
meio_events$minimumDepthInMeters<-as.numeric(meio_events$minimumDepthInMeters)
meio_events$maximumDepthInMeters<-as.numeric(meio_events$maximumDepthInMeters)

# class(meio_events$maximumDepthInMeters)
# class(meio_events$minimumDepthInMeters)
# class(meio_events$recordedBy) #ask Freya to follow format - add last names, and pipes between names
# class(meio_events$samplingProtocol) #some missing. ask Freya to fill this in
# class(meio_events$habitatGeomorphologicalZone) #some missing. ask Freya to fill this in
# class(meio_events$habitatSubstrate) # some missing. ask Freya to fill this in
# class(meio_events$habitatBiotic)

#Recommended fields
# class(meio_events$locality)
# class(meio_events$eventRemarks)

#format the $%*& date correctly
meio_events$eventDate<-as.Date(as.numeric(meio_events$eventDate), origin="1900/01/01")
meio_events$day<-day(meio_events$eventDate)
meio_events$month<-month(meio_events$eventDate)
meio_events$year<-year(meio_events$eventDate)
meio_events$eventDate<-NULL

#format the $^#&* time correctly
meio_events$eventTime<-format(.POSIXct(86400*as.numeric(meio_events$eventTime), "UTC"), "%H:%M")
#meio_events$eventTime<-as.character(parse_date_time(meio_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")

meio_events$eventMedia<-"N"

```

## ARMS

```{r ARMS events}
arms_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/ARMS/MarineGEOHI_bioassessment_master-ARMS.xlsx", sheet = "Station")

### Format events to schema

#change all empty columns from logical to character class
arms_events[sapply(arms_events, is.logical)] <- lapply(arms_events[sapply(arms_events, is.logical)],  as.character)


#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
arms_events$decimalLatitude<-round(as.numeric(arms_events$decimalLatitude),digits = 4)
arms_events$decimalLongitude<-abs(round(as.numeric(arms_events$decimalLongitude),digits = 4))*-1
arms_events$coordinateUncertaintyInMeters<-10

arms_events$minimumDepthInMeters<-as.numeric(arms_events$minimumDepthInMeters)

# class(arms_events$geoReferenceProtocol)
# class(arms_events$eventID)
# class(arms_events$maximumDepthInMeters)
# class(arms_events$recordedBy) #ask Laetitia to follow name format
# class(arms_events$samplingProtocol)
# class(arms_events$habitatGeomorphologicalZone)
# class(arms_events$habitatSubstrate) 
# class(arms_events$habitatBiotic)

#Recommended fields
# class(arms_events$locality)
# class(arms_events$year)
# class(arms_events$month)
# class(arms_events$day)

arms_events$eventMedia<-"Y" #still need to get these from Laetitia

```

## Visual Transects

```{r Transect Events}
trans_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/bioassessment_KANV.xlsx", sheet = "Station")

trans_event_photos<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/KANV_benthic-photos_filenames_20170711.xlsx", sheet = "Sheet1")

# event photos
#pop off the eventID into its own field
trans_event_photos$eventID<-sub(pattern="(KANV\\d\\d\\d)_.+",replacement = "\\1", trans_event_photos$eventMedia, perl=T)

#use ddply to lump all eventMedia into a single field, separated by a |
trans_eventMedia<-ddply(trans_event_photos, "eventID", transform, eventMedia = paste(eventMedia, collapse = "|"))

#keep only the first instance of each occurrenceID
trans_eventMedia<-trans_eventMedia[!duplicated(trans_eventMedia$eventID),]

#delete original eventMedia column and join on the new one
trans_events$eventMedia<-NULL
trans_events<-left_join(trans_events, trans_eventMedia, by="eventID")


### Format events to schema
#change all empty columns from logical to character class
trans_events[sapply(trans_events, is.logical)] <- lapply(trans_events[sapply(trans_events, is.logical)],  as.character)
#round to 4 decimal digits (~10m uncertainty) and meake sure longitude is negative
trans_events$decimalLatitude<-round(as.numeric(trans_events$decimalLatitude),digits = 4)
trans_events$decimalLongitude<-abs(round(as.numeric(trans_events$decimalLongitude),digits = 4))*-1

trans_events$minimumDepthInMeters<-as.numeric(trans_events$minimumDepthInMeters)

# class(trans_events$eventID)
# class(trans_events$coordinateUncertaintyInMeters)
# class(trans_events$maximumDepthInMeters)
# class(trans_events$recordedBy) 
# class(trans_events$samplingProtocol)
# class(trans_events$habitatGeomorphologicalZone)
# class(trans_events$habitatSubstrate)
# class(trans_events$habitatBiotic)

#Recommended fields
# class(trans_events$locality)
# class(trans_events$year)
# class(trans_events$month)
# class(trans_events$day)

trans_events$eventTime<-as.character(parse_date_time(trans_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")

```

## Join Events

```{r Join Events}
events<-full_join(fish_events,algae_events)
events<-full_join(events,invert_events)
events<-full_join(events,meio_events)
events<-full_join(events,arms_events)
events<-full_join(events,trans_events)


```
# Build sample databases

## Fish

### Import
```{r Fish_import}
# First go through and make sure all stations have Lat/Longs, or as many as possible. Delete secondary lat/longs
#read in the sample data, skipping first 3 lines of other headers. Format all times as hh:mm in Excel, paste into textwrangler if they need homogenization (i.e. multiple formats of times)

fish_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Samples")

fish_genetic<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Genetic samples")
#these commands apply to the original fish-group template
#remove columns without mapped DwC terms mapped
#fish_samples<-fish_samples[,-grep(pattern = "^X",x = names(fish_samples),perl=T)]
#fish_events<-fish_events[,-grep(pattern = "^X",x = names(fish_events),perl=T)]

#remove records without occurrenceIDs (temp before final dataset)
#fish_samples<-fish_samples[-which(is.na(fish_samples$occurrenceID)),]
#translate fieldIDs to eventIDs
#fish_samples$eventID<-gsub("LRP 17-","KANF0",fish_samples$eventID)
#use ddply to lump all materialSampleIDs into a single field, separated by a |
#fish_samples<-ddply(fish_samples, "occurrenceID", transform, materialSampleID = #paste(materialSampleID, collapse = "|"))
#keep only the first instance of each occurrenceID
#fish_samples<-fish_samples[!duplicated(fish_samples$occurrenceID),]
```


### Format samples to schema

```{r format fish samples}
#change all empty columns from logical to character class
fish_samples[sapply(fish_samples, is.logical)] <- lapply(fish_samples[sapply(fish_samples, is.logical)],  as.character)

#Fix the eventIDs
fish_samples$eventID<-str_extract(fish_samples$eventID,"KANF\\d\\d\\d")


fish_samples$otherCatalogNumbers<-NULL # drop this for now - it will be replaced by fish_biorep below

# class(fish_samples[,which(sapply(fish_samples, is.logical))])<-"character"
# class(fish_samples$occurrenceID)
# class(fish_samples$basisofRecord)
# class(fish_samples$catalogNumber)<-"character"
# class(fish_samples$organismScope)
# class(fish_samples$eventID)
# class(fish_samples$scientificName) #eventually parse this into taxon categories?
# class(fish_samples$taxonRank) #using this
# class(fish_samples$individualCount)
# class(fish_samples$institutionID)

# make identifiedBy go firstname lastname
fish_samples$identifiedBy<-str_replace(fish_samples$identifiedBy, pattern="(\\w+), (.+)", replacement="\\2 \\1")

fish_samples$catalogNumber<-as.character(fish_samples$catalogNumber)

```

### Format the biorepository info
```{r fish biorepository}
colnames(fish_genetic)[5]<-"BiorepositoryID"
colnames(fish_genetic)[6]<-"tissueNotes"
colnames(fish_genetic)[2]<-"occurrenceID"
#use ddply to lump all materialSampleIDs into a single field, separated by a |
fish_biorep<-ddply(fish_genetic, "occurrenceID", transform, otherCatalogNumbers = paste(BiorepositoryID, collapse = "|"), tissueNotes = paste(tissueNotes, collapse="|"))
#keep only the first instance of each occurrenceID
fish_biorep<-fish_biorep[!duplicated(fish_biorep$otherCatalogNumbers),]
fish_biorep$otherCatalogNumbers<-as.character(fish_biorep$otherCatalogNumbers)
```

### Join the event data onto the occurrence data

```{r fish join}

#first join the biorep numbers to this data
fish_samples<-left_join(fish_samples,fish_biorep[,c(2,6,7)],by="occurrenceID")
#now join samples and events
fish<-left_join(fish_samples,events,by="eventID")


```

### Make a Map

```{r fish maps}
#optionally make it from the points provided
#bbox<-make_bbox(lon=fish2$decimalLongitude,lat=fish2$decimalLatitude)
#by individual
fish2<-fish %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(count=n()) 

#by species
fish3 <- fish %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())

#richness
fish4<-fish3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())

fish6<-left_join(fish2,fish4)


fish_map<-ggmap(dmap) + geom_point(data = fish6, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=count, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Species Richness/Site",order=1), size=guide_legend(title="Individual Count/Site"),order=2) + theme(axis.title=element_blank(), axis.text=element_text(color="black",size = 12), plot.title = element_text(hjust=0.5)) + geom_segment(arrow=arrow(length=unit(4,"mm"), type="closed", angle=40), 
               aes(x=-157.75,xend=-157.75,y=21.495,yend=21.5), colour=hcl(240,50,80)) + geom_label(aes(x=-157.75, y=21.495, label="N"),
             size=3, label.padding=unit(1,"mm"), label.r=unit(0.4,"lines")) + ggtitle(label="Fishes")

fish_map


ggsave(fish_map,filename="./output/fish_map.pdf")

```


## Algae

### Import
```{r Algae import}
#edit eventIDs for capitalization

algae_samples<-read_excel("/Users/eric/google_drive/MarineGEO/algae/MarineGEOHI_bioassessment_master_KANA.xlsx", sheet="Sample")

```

### Format samples to schema

```{r format algae samples}
#change all empty columns from logical to character class
algae_samples[sapply(algae_samples, is.logical)] <- lapply(algae_samples[sapply(algae_samples, is.logical)],  as.character)

# class(algae_samples$occurrenceID)
# class(algae_samples$catalogNumber)
# class(algae_samples$otherCatalogNumbers)
# class(algae_samples$organismScope)
# class(algae_samples$eventID)
# class(algae_samples$scientificName) #eventually parse this into taxon categories?
# class(algae_samples$taxonRank) # Melinda needs to populate this...
# class(algae_samples$identifiedBy)
# class(algae_samples$individualCount)

algae_samples$basisofRecord<-"specimen"
algae_samples$institutionID<-"USNM"


```


### Join the event data onto the occurrence data

```{r join algae}

algae<-left_join(algae_samples,events,by="eventID")

```

### Make A Map
```{r algae maps}


#by individual
algae2<-algae %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(count=n()) 

#by species
algae3 <- algae[-which(algae$scientificName=="?"),] %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())

#richness
algae4<-algae3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())

algae5<-left_join(algae2,algae4)

#algae5<-algae5[which(is.na(algae5$decimalLatitude)),]
#algae5<-algae5[-which(algae5$scientificName=="?"),]

algae5[4,4]<-1
algae5[11,4]<-2
extra<-algae5[33,]
algae5<-algae5[-33,] #remove outlier


algae_map<-ggmap(dmap) + geom_point(data = algae5,  mapping = aes(x = decimalLongitude, y = decimalLatitude, size=count, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness/Site",order=1), size=guide_legend(title="Individual Count/Site"),order=2) + theme(axis.title=element_blank(), axis.text=element_text(color="black",size = 12), plot.title = element_text(hjust=0.5)) + geom_segment(arrow=arrow(length=unit(4,"mm"), type="closed", angle=40), 
               aes(x=-157.75,xend=-157.75,y=21.495,yend=21.5), colour=hcl(240,50,80)) + geom_label(aes(x=-157.75, y=21.495, label="N"),
             size=3, label.padding=unit(1,"mm"), label.r=unit(0.4,"lines")) + ggtitle(label="Algae")
#add outlier back in as a red dot size=30

algae_map

ggsave(algae_map,filename="./output/algae_map.pdf")
```

## Macroinvertebrates

### Import
```{r inverts import}
invert_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Inverts/BKANE_063017_FIMS.xlsx", sheet = "Specimen",skip=3)

#sponges will be loaded here, cleaned up, and then joined with the rest of the inverts
sponge_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Sponges/VICENTE_BIOBLITZ2017_SPONGEMETADATA_FIMS.xlsx", sheet = "Sheet1",skip=1)

#remove columns that don't appear in MarineGEO schema
invert_samples<-invert_samples[,-grep(pattern = "^X",x = names(invert_samples),perl=T)]
sponge_samples<-sponge_samples[,-grep(pattern = "^X",x = names(sponge_samples),perl=T)]

#remove records without occurrenceIDs (temp before final dataset)
invert_samples<-invert_samples[-which(is.na(invert_samples$scientificName)),]
sponge_samples<-sponge_samples[-which(is.na(sponge_samples$scientificName)),]

#change all empty columns from logical to character class
invert_samples[sapply(invert_samples, is.logical)] <- lapply(invert_samples[sapply(invert_samples, is.logical)],  as.character)


sponge_samples[sapply(sponge_samples, is.logical)] <- lapply(sponge_samples[sapply(sponge_samples, is.logical)],  as.character)

sponge_samples$phylum[which(is.na(sponge_samples$phylum))]<-"Porifera"
sponge_samples$eventID<-str_replace(sponge_samples$eventID,pattern="-", replacement="")


invert_samples<-full_join(invert_samples,sponge_samples)
```

### Taxonomize

This code will populate `taxonRank` with the lowest known taxonomic category

```{r Inverts Taxonomize}

invert_samples$taxonRank[which(!is.na(invert_samples$phylum))]<-"phylum"
invert_samples$taxonRank[which(!is.na(invert_samples$class))]<-"class"
invert_samples$taxonRank[which(!is.na(invert_samples$subclass))]<-"subclass"
invert_samples$taxonRank[which(!is.na(invert_samples$order))]<-"order"
invert_samples$taxonRank[which(!is.na(invert_samples$suborder))]<-"suborder"
invert_samples$taxonRank[which(!is.na(invert_samples$superfamily))]<-"superfamily"
invert_samples$taxonRank[which(!is.na(invert_samples$family))]<-"family"
invert_samples$taxonRank[which(!is.na(invert_samples$subfamily))]<-"subfamily"
invert_samples$taxonRank[which(!is.na(invert_samples$genus))]<-"genus"
invert_samples$taxonRank[which(!is.na(invert_samples$species))]<-"species"
invert_samples$taxonRank[grep("sp\\.",invert_samples$species)]<-"genus"


```


### Format samples to schema

```{r format invert samples}
#change all empty columns from logical to character class moved this up
#invert_samples[sapply(invert_samples, is.logical)] <- #lapply(invert_samples[sapply(invert_samples, is.logical)],  as.character)

# class(invert_samples$occurrenceID)
# class(invert_samples$scientificName) #eventually parse this into taxon categories?
# class(invert_samples$taxonRank) #using this

invert_samples$basisOfRecord<-"specimen"
invert_samples$organismScope<-"organism"

#remove eventIDs that have two possibilities, assume it is the first one for now
invert_samples$eventID<-gsub(pattern="(\\w) or \\w",replacement="\\1",x= invert_samples$eventID,perl=T)

#remove the A,B,C etc. from the ends of eventIDs until the IZ team fixes them up to be meaningful
invert_samples$eventID<-str_replace(string=invert_samples$eventID, pattern="(KAN\\w\\d\\d\\d)[A-Z]",replacement="\\1")

#fix arms eventIDs
invert_samples$eventID[grep(pattern="KANEO",invert_samples$eventID)]<-str_replace(string=invert_samples$eventID[grep(pattern="KANEO",invert_samples$eventID)],pattern="KANEO\\d(\\d\\d)", replacement="KANEO_2017_ARMS\\1")


invert_samples$identifiedBy<-"Paulay, Gustav" # for now. don't know if they kept this info
#assume that blank counts had 1 individual for now
invert_samples$individualCount[is.na(invert_samples$individualCount)]<-1
invert_samples$institutionID<-"FLMNH"


```

### Join the event data onto the occurrence data

```{r invert join}
invert<-left_join(invert_samples,events,by="eventID")
```

### Make A Map
```{r invert maps}
#by individual
invert2<-invert %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T)) 

#by species
invert3 <- invert %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())

#richness
invert4<-invert3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())

invert5<-left_join(invert2,invert4)


invert_map<-ggmap(dmap) + geom_point(data = invert5[-c(length(invert5$richness)-1,length(invert5$richness)),], mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness/Site",order=1), size=guide_legend(title="Individual Count/Site"),order=2) + theme(axis.title=element_blank(), axis.text=element_text(color="black",size = 12), plot.title = element_text(hjust=0.5)) + geom_segment(arrow=arrow(length=unit(4,"mm"), type="closed", angle=40), 
               aes(x=-157.75,xend=-157.75,y=21.495,yend=21.5), colour=hcl(240,50,80)) + geom_label(aes(x=-157.75, y=21.495, label="N"),
             size=3, label.padding=unit(1,"mm"), label.r=unit(0.4,"lines")) + ggtitle(label="Macroinvertebrates")

invert_map

ggsave(invert_map,filename="./output/invert_map.pdf")

#create a vector of the eventIDs not finding a match in the events database
bad_eventIDs<-invert$eventID[which(is.na(invert$decimalLatitude))]
```

### Make A Sponge Map

```{r sponge maps}
sponge<-invert[which(invert$phylum=="Porifera"),]
#by individual
sponge2<-sponge %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T)) 

#by species
sponge3 <- sponge %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())

#richness
sponge4<-sponge3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())

sponge5<-left_join(sponge2,sponge4)


sponge_map<-ggmap(dmap) + geom_point(data = sponge5[-c(61,62),], mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness/Site",order=1), size=guide_legend(title="Individual Count/Site"),order=2) + theme(axis.title=element_blank(), axis.text=element_text(color="black",size = 12), plot.title = element_text(hjust=0.5)) + geom_segment(arrow=arrow(length=unit(4,"mm"), type="closed", angle=40), 
               aes(x=-157.75,xend=-157.75,y=21.495,yend=21.5), colour=hcl(240,50,80)) + geom_label(aes(x=-157.75, y=21.495, label="N"),
             size=3, label.padding=unit(1,"mm"), label.r=unit(0.4,"lines")) + ggtitle(label="Sponges")

sponge_map

ggsave(sponge_map,filename="./output/sponge_map.pdf")

```

## Meiofauna

### Import
```{r import meiofauna}
#note meio data still need some cleaning, including importing some stations from inverts and fish, and fixing up eventIDs to have 3 digits instead of 2. There are 27 specimens that do not have station information because of this.

meio_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Meiofauna/Hawaii2017_meiofauna_MarineGEO.xlsx", sheet = "specimen data",skip=1)

#remove columns that don't appear in MarineGEO schema
meio_samples<-meio_samples[,-grep(pattern = "^X",x = names(meio_samples),perl=T)]

#add occurrence IDs to one investigators samples - this has been taken care of now
#meio_samples$occurrenceID[which(is.na(meio_samples$occurrenceID))]<-"UJ" 
#make occurrence IDs unique
#meio_samples$occurrenceID<-make.unique(meio_samples$occurrenceID, sep="_")


```


### Format samples to schema

```{r format meiofauna samples}
#change all empty columns from logical to character class
meio_samples[sapply(meio_samples, is.logical)] <- lapply(meio_samples[sapply(meio_samples, is.logical)],  as.character)

meio_samples$basisofRecord<-"specimen" #check with Frey that this is correct
meio_samples$institutionID<-"USNM"

# class(meio_samples$scientificName) #eventually parse this into taxon categories?
# class(meio_samples$taxonRank) #using this
# class(meio_samples$occurrenceID)
# class(meio_samples$catalogNumber)
# class(meio_samples$otherCatalogNumbers)
# class(meio_samples$organismScope)
# class(meio_samples$eventID)
# class(meio_samples$identifiedBy)
# class(meio_samples$individualCount)


```

### Join the event data onto the occurrence data

```{r meiofauna join}
meio<-left_join(meio_samples,meio_events,by="eventID")
```

### Make A Map
```{r meiofauna maps}
#by individual
meio2<-meio %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T)) 

#by species
meio3 <- meio %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())

#richness
meio4<-meio3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())

meio5<-left_join(meio2,meio4)


meio_map<-ggmap(dmap) + geom_point(data = meio5, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness/Site",order=1), size=guide_legend(title="Individual Count/Site"),order=2) + theme(axis.title=element_blank(), axis.text=element_text(color="black",size = 12), plot.title = element_text(hjust=0.5)) + geom_segment(arrow=arrow(length=unit(4,"mm"), type="closed", angle=40), 
               aes(x=-157.75,xend=-157.75,y=21.495,yend=21.5), colour=hcl(240,50,80)) + geom_label(aes(x=-157.75, y=21.495, label="N"),
             size=3, label.padding=unit(1,"mm"), label.r=unit(0.4,"lines")) + ggtitle(label="Meiofauna")
meio_map

ggsave(meio_map,filename="./output/meio_map.pdf")
```

## ARMS

### Import
```{r import arms}
arms_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/ARMS/MarineGEOHI_bioassessment_master-ARMS.xlsx", sheet = "Sample")

```


### Format samples to schema

```{r format arms samples}
#change all empty columns from logical to character class
arms_samples[sapply(arms_samples, is.logical)] <- lapply(arms_samples[sapply(arms_samples, is.logical)],  as.character)

# class(arms_samples$occurrenceID)
# class(arms_samples$basisofRecord)
# class(arms_samples$catalogNumber)
# class(arms_samples$otherCatalogNumbers)
# class(arms_samples$organismScope) #were there some slurries too?
# class(arms_samples$eventID)
# class(arms_samples$scientificName) #eventually parse this into taxon categories?
# class(arms_samples$taxonRank) #using this
# class(arms_samples$identifiedBy)
# class(arms_samples$individualCount)

# change NAs for individualCount to 1 for now
arms_samples$individualCount[which(is.na(arms_samples$individualCount))]<-1

arms_samples$institutionID<-"USNM"

```

### Join the event data onto the occurrence data

```{r arms join}
arms<-left_join(arms_samples,arms_events,by="eventID")
```

### Make A Map
```{r arms maps}
#by individual
arms2<-arms %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T)) 

#by species
arms3 <- arms %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n())

#richness
arms4<-arms3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())

arms5<-left_join(arms2,arms4)


arms_map<-ggmap(dmap) + geom_point(data = arms5, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness/Site",order=1), size=guide_legend(title="Individual Count/Site"),order=2) + theme(axis.title=element_blank(), axis.text=element_text(color="black",size = 12), plot.title = element_text(hjust=0.5)) + geom_segment(arrow=arrow(length=unit(4,"mm"), type="closed", angle=40), 
               aes(x=-157.75,xend=-157.75,y=21.495,yend=21.5), colour=hcl(240,50,80)) + geom_label(aes(x=-157.75, y=21.495, label="N"),
             size=3, label.padding=unit(1,"mm"), label.r=unit(0.4,"lines")) + ggtitle(label="Autonomous Reef Monitoring Structure") 

arms_map

ggsave(arms_map,filename="./output/arms_map.pdf")
```


## Visual Transects

### Import
```{r transects import}

trans_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/bioassessment_KANV.xlsx", sheet = "Sample")

trans_organism_photos<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/KANV_photo-vouchers_20170712.xlsx", sheet = "Sheet1")
```

### Format, collapse and merge the media fields

### Organism photos
```{r transect photo fields}
# organism photos

## delete the field to be added later
trans_samples$associatedMedia<-NULL

#pop off the eventID again into its own field
trans_organism_photos$eventID<-sub(pattern="(KANV\\d\\d\\d)_.+",replacement = "\\1", trans_organism_photos$associatedMedia, perl=T)

#pop off the species name into its own field
trans_organism_photos$scientificName<-sub(pattern=".+_(\\d{8})_(\\w+-[a-z]+)_.+", replacement = "\\2", trans_organism_photos$associatedMedia,perl=T)
trans_organism_photos$scientificName<-sub(pattern="-", replacement=" ",x = trans_organism_photos$scientificName)

#pop off the initials of the collector, and replace with full name
trans_organism_photos$identifiedBy<-str_extract(pattern="ZF|RW", string=trans_organism_photos$associatedMedia)
trans_organism_photos$identifiedBy[which(trans_organism_photos$identifiedBy=="ZF")]<-"Zach Foltz"
trans_organism_photos$identifiedBy[which(trans_organism_photos$identifiedBy=="RW")]<-"Ross Whippo"

#use ddply to lump all associatedMedia into a single field, separated by a |
trans_associatedMedia<-ddply(trans_organism_photos, c("eventID","scientificName","identifiedBy"), transform, associatedMedia = paste(associatedMedia, collapse = "|"))

#remove all but the first instance
trans_associatedMedia<-trans_associatedMedia[!duplicated(trans_associatedMedia$associatedMedia),]

trans_samples<-left_join(trans_samples, trans_associatedMedia, by=c("eventID","scientificName","identifiedBy"))

```

### Format samples to schema

```{r format transect samples}
#change all empty columns from logical to character class
trans_samples[sapply(trans_samples, is.logical)] <- lapply(trans_samples[sapply(trans_samples, is.logical)],  as.character)

# class(trans_samples$occurrenceID)
# class(trans_samples$basisofRecord)
# class(trans_samples$catalogNumber)
# class(trans_samples$otherCatalogNumbers)
# class(trans_samples$organismScope)
# class(trans_samples$eventID)
# class(trans_samples$scientificName) #eventually parse this into taxon categories?
# class(trans_samples$taxonRank) #using this
# class(trans_samples$identifiedBy)
# class(trans_samples$individualCount)

trans_samples$institutionID<-"USNM"


```

### Join the event data onto the occurrence data

```{r join transects}
trans<-left_join(trans_samples,trans_events,by="eventID")
```

### Make A Map
```{r transect maps}
#by individual
trans2<-trans %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount=sum(individualCount,na.rm = T)) 

#by species
trans3 <- trans %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count=n(), individualCount=sum(individualCount,na.rm=T))

#richness
trans4<-trans3 %>% group_by(decimalLatitude,decimalLongitude) %>% summarize(richness=n())

trans5<-left_join(trans2,trans4)


trans_map<-ggmap(dmap) + geom_point(data = trans5, mapping = aes(x = decimalLongitude, y = decimalLatitude, size=individualCount, color=richness)) + scale_color_gradient(low = "green", high="red") + guides(color=guide_colorbar(title="Type Richness/Site",order=1), size=guide_legend(title="Individual Count/Site"),order=2) + theme(axis.title=element_blank(), axis.text=element_text(color="black",size = 12), plot.title = element_text(hjust=0.5)) + geom_segment(arrow=arrow(length=unit(4,"mm"), type="closed", angle=40), 
               aes(x=-157.75,xend=-157.75,y=21.495,yend=21.5), colour=hcl(240,50,80)) + geom_label(aes(x=-157.75, y=21.495, label="N"),
             size=3, label.padding=unit(1,"mm"), label.r=unit(0.4,"lines")) + ggtitle(label="Visual Transects")

trans_map

ggsave(trans_map,filename="./output/trans_map.pdf")
```

# Final Database

Now to join everything into one monster database

```{r}
a<-full_join(fish,algae)
b<-full_join(a,invert)
c<-full_join(b,meio)
d<-full_join(c,arms)
MarineGEOHI<-full_join(d,trans)

#read in the flat schema
flat_schema<-as.vector(read.csv("flat_schema.csv", header=F, stringsAsFactors = F))

#reduce the joined database to just the columns in the schema, and sort on column order
MarineGEOHI2<-MarineGEOHI[,match(flat_schema, names(MarineGEOHI))]

write.csv(MarineGEOHI2,"./output/MarineGEOHI_data_1.1.csv",row.names = F)

#WriteXLS(c(MarineGEOHI2,as.data.frame(events)),ExcelFileName = "./output/MarineGEOHI_data_1.1.xlsx",SheetNames=c("Events","Samples"))

```