Use R to retrieve data from NCEI#
#' R Code for Retrieving NCEI Data
#' Contains functions for retrieving WLE weekly monitoring data
#' from NCEI website and consolidating into one csv file for downstream
#' analysis.
#'
#' C. Kitchens
#' University of Michigan
#' Cooperative Institute for Great Lakes Research
#' August 2022
# 0 - Install Packages ----------------------------------------------------
# install.packages("httr")
# install.packages("lubridate")
# install.packages("tidyverse")
Import packages#
library(httr)
library(lubridate)
library(tidyverse)
Attaching package: 'lubridate'
The following object is masked from 'package:base':
date
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
Registered S3 method overwritten by 'rvest':
method from
read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1 v purrr 0.3.2
v tibble 2.1.1 v dplyr 0.8.0.1
v tidyr 0.8.3 v stringr 1.4.0
v readr 1.3.1 v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x lubridate::as.difftime() masks base::as.difftime()
x lubridate::date() masks base::date()
x dplyr::filter() masks stats::filter()
x lubridate::intersect() masks base::intersect()
x dplyr::lag() masks stats::lag()
x lubridate::setdiff() masks base::setdiff()
x lubridate::union() masks base::union()
Set working directory#
# Generate file path to working directory based on OS in use. OS.type will
# return "unix" if on Mac and "windows" if on PC.
if(.Platform$OS.type == "unix") {
file_path <- file.path("/volumes",
"GoogleDrive",
"Shared drives",
"WE Stoichiometry paper",
"R",
fsep = "/")
} else {
file_path <- file.path("D:",
"masonl",
"_py",
"ncei",
fsep = "/")
}
setwd(file_path)
Read in NCEI Data Using API Call#
#' Obtain urls linking to NCEI data for both 2012-2018 data files and 2019 data file
path_2018 = 'https://www.nodc.noaa.gov/archive/arc0135/0187718/2.2/data/0-data/lake_erie_habs_field_sampling_results_2012_2018_v2.csv'
path_2019 = 'https://www.nodc.noaa.gov/archive/arc0152/0209116/1.1/data/0-data/lake_erie_habs_field_sampling_results_2019.csv'
path_2021 = 'https://www.nodc.noaa.gov/archive/arc0204/0254720/1.1/data/0-data/noaa-glerl-erie-habs-field-sampling-results-2020-2021.csv'
#' Use GET to make REST API call to website
file_2018 <- GET(url = path_2018)
file_2019 <- GET(url = path_2019)
file_2021 <- GET(url = path_2021)
#' Turn raw content into csvs
file_2018 <- content(file_2018, as = 'raw')
writeBin(file_2018, 'data/NCEI_data_2018.csv')
file_2019 <- content(file_2019, as = 'raw')
writeBin(file_2019, 'data/NCEI_data_2019.csv')
file_2021 <- content(file_2021, as = 'raw')
writeBin(file_2021, 'data/NCEI_data_2021.csv')
df_2018 <- read.csv(
file = 'data/NCEI_data_2018.csv',
header = TRUE, dec = ',',
strip.white = TRUE,
na.strings = c(""),
check.names = TRUE,
fileEncoding = 'WINDOWS-1252')
df_2019 <- read.csv(
file = 'data/NCEI_data_2019.csv',
header = TRUE,
dec = ',',
strip.white = TRUE,
na.strings = c(""),
check.names = TRUE,
fileEncoding = 'WINDOWS-1252')
df_2021 <- read.csv(
file = 'data/NCEI_data_2021.csv',
header = TRUE,
dec = ',',
strip.white = TRUE,
na.strings = c(""),
check.names = TRUE)
Use Set Operations to Identify Differences in Data Structure#
colnames(df_2018)
colnames(df_2019)
colnames(df_2021)
#' See what columns are absent across the various data frames
a <- names(df_2018)
b <- names(df_2019)
c <- names(df_2021)
setdiff(a, b)
setdiff(a, c)
setdiff(b, a)
setdiff(b, c)
setdiff(c, a)
setdiff(c, b)
#' Remove columns not shared in common and rename colnames
df_2018 <- df_2018 %>%
select(-Sample.Temperature...C.,-Urea..µg.N.L.,-Dissolved.Organic.Carbon..mg.L.) %>%
mutate(Date = mdy(Date))
df_2019 <- df_2019 %>%
mutate(Date = mdy(Date))
df_2021 <- df_2021 %>%
mutate(date = mdy(date))
colnames(df_2018) <- colnames(df_2021)
colnames(df_2019) <- colnames(df_2021)
#' Combine all data frames into a single data frame
df = rbind(df_2018, df_2019, df_2021)
#' Remove unnecessary columns, rename remaining, and create additional needed
#' columns. Note that as.numeric() function coerces any cell with a non-numeric
#' character (e.g. "-1.21") to an NA.
df2 <- df %>%
select(
-time,
-lat,
-lon,
-wind_speed,
-wave_height,
-sky,
-secchi_depth,
-ctd_temp,
-ctd_specific_conductivity,
-ctd_beam_attenuation,
-ctd_transmission,
-ctd_dissolved_oxygen,
-ctd_par,
-particulate_microcystin,
-dissolved_microcystin
) %>%
rename(
Date = date,
Site = station_name,
StationDepth_m = station_depth_m,
SampleDepth_m = sample_depth_m,
SampleDepth_cat = sample_depth_category,
Turbidity_NTU = turbidity,
PC_ugL = extracted_phycocyanin,
CHLA_ugL = extracted_chla,
TP_ugL = total_p,
TDP_ugL = total_dissolved_p,
SRP_ugL = soluble_reactive_p,
Ammonia_ugL = ammonia,
NitrateAndNitrite_mgL = nitrate_nitrite,
POC_mgL = particulate_organic_c,
PON_mgL = particulate_organic_n,
CDOM_400nmABS = cdom,
TSS_mgL = tss,
VSS_mgL = vss
) %>%
mutate(
Date = as.Date(Date, format = '%Y-%m-%d'),
StationDepth_m = as.numeric(StationDepth_m),
SampleDepth_m = as.numeric(SampleDepth_m),
Turbidity_NTU = as.numeric(Turbidity_NTU),
PC_ugL = as.numeric(PC_ugL),
CHLA_ugL = as.numeric(CHLA_ugL),
TP_ugL = as.numeric(TP_ugL),
TDP_ugL = as.numeric(TDP_ugL),
SRP_ugL = as.numeric(SRP_ugL),
Ammonia_ugL = as.numeric(Ammonia_ugL),
NitrateAndNitrite_mgL = as.numeric(NitrateAndNitrite_mgL),
POC_mgL = as.numeric(POC_mgL),
PON_mgL = as.numeric(PON_mgL),
CDOM_400nmABS = as.numeric(CDOM_400nmABS),
TSS_mgL = as.numeric(TSS_mgL),
VSS_mgL = as.numeric(VSS_mgL)
) %>%
#' For columns that have bdls entered as "<XXXX" or "bdl", R interprets
#' as string instead of numeric, so remove "<" symbol from those entries,
#' then convert entire columns to numeric objects.The result is a data
#' frame where entries below limit of detection (LOD) are set equal to that LOD.
#' For VSS, there is no recorded LOD, so "bdl" entries are set to NA.
mutate(
Date = ymd(Date),
TDP_ugL = as.numeric(gsub("<", "", TDP_ugL)),
SRP_ugL = as.numeric(gsub("<", "", SRP_ugL)),
Ammonia_ugL = as.numeric(gsub("<", "", Ammonia_ugL)),
NitrateAndNitrite_mgL = as.numeric(gsub("<", "", NitrateAndNitrite_mgL)),
VSS_mgL = as.numeric(gsub("bdl", "", VSS_mgL))
) %>%
mutate(
JulianDate_DDD = yday(Date),
PP_ugL = TP_ugL - TDP_ugL
) %>%
filter(
SampleDepth_cat == 'Surface',
Site %in% c('WE2', 'WE4', 'WE6', 'WE8', 'WE9', 'WE12', 'WE13', 'WE14', 'WE15', 'WE16'),
) %>%
select(
-SampleDepth_cat
)
#' Add leading 0 to stations less than 10 (e.g. WE2 becomes WE02)
df3 <- df2 %>%
mutate(Site = gsub('(WE)(\\d{1})$', 'WE0\\2', Site))
- 'Date'
- 'Site'
- 'Station.Depth..m.'
- 'Sample.Depth..m.'
- 'Sample.Depth..category.'
- 'Local.Time..Eastern.Time.Zone.'
- 'Latitude..decimal.deg.'
- 'Longitude..decimal.deg.'
- 'Wind.speed..knots.'
- 'Wave.Height..ft.'
- 'Sky'
- 'Secchi.Depth..m.'
- 'Sample.Temperature...C.'
- 'CTD.Temperature...C.'
- 'CTD.Specific.Conductivity..µS.cm.'
- 'CTD.Beam.Attenuation..m.1.'
- 'CTD.Tramission....'
- 'CTD.Dissolved.Oxygen..mg.L.'
- 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
- 'Turbidity..NTU.'
- 'Particulate.Microcystin..µg.L.'
- 'Dissolved.Microcystin..µg.L.'
- 'Extracted.Phycocyanin..µg.L.'
- 'Extracted.Chlorophyll.a..µg.L.'
- 'Total.Phosphorus..µg.P.L.'
- 'Total.Dissolved.Phosphorus..µg.P.L.'
- 'Soluble.Reactive.Phosphorus..µg.P.L.'
- 'Ammonia..µg.N.L.'
- 'Nitrate...Nitrite..mg.N.L.'
- 'Urea..µg.N.L.'
- 'Particulate.Organic.Carbon..mg.L.'
- 'Particulate.Organic.Nitrogen..mg.L.'
- 'Dissolved.Organic.Carbon..mg.L.'
- 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
- 'Total.Suspended.Solids..mg.L.'
- 'Volatile.Suspended.Solids..mg.L.'
- 'Date'
- 'Site'
- 'Station.Depth..m.'
- 'Sample.Depth..m.'
- 'Sample.Depth..category.'
- 'Local.Time..Eastern.Time.Zone.'
- 'Latitude..decimal.deg.'
- 'Longitude..decimal.deg.'
- 'Wind.speed..knots.'
- 'Wave.Height..ft.'
- 'Sky'
- 'Secchi.Depth..m.'
- 'CTD.Temperature...C.'
- 'CTD.Specific.Conductivity..µS.cm.'
- 'CTD.Beam.Attenuation..m.1.'
- 'CTD.Tramission....'
- 'CTD.Dissolved.Oxygen..mg.L.'
- 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
- 'Turbidity..NTU.'
- 'Particulate.Microcystin..µg.L.'
- 'Dissolved.Microcystin..µg.L.'
- 'Extracted.Phycocyanin..µg.L.'
- 'Extracted.Chlorophyll.a..µg.L.'
- 'Total.Phosphorus..µg.P.L.'
- 'Total.Dissolved.Phosphorus..µg.P.L.'
- 'Soluble.Reactive.Phosphorus..µg.P.L.'
- 'Ammonia..µg.N.L.'
- 'Nitrate...Nitrite..mg.N.L.'
- 'Particulate.Organic.Carbon..mg.L.'
- 'Particulate.Organic.Nitrogen..mg.L.'
- 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
- 'Total.Suspended.Solids..mg.L.'
- 'Volatile.Suspended.Solids..mg.L.'
- 'date'
- 'station_name'
- 'station_depth_m'
- 'sample_depth_m'
- 'sample_depth_category'
- 'time'
- 'lat'
- 'lon'
- 'wind_speed'
- 'wave_height'
- 'sky'
- 'secchi_depth'
- 'ctd_temp'
- 'ctd_specific_conductivity'
- 'ctd_beam_attenuation'
- 'ctd_transmission'
- 'ctd_dissolved_oxygen'
- 'ctd_par'
- 'turbidity'
- 'particulate_microcystin'
- 'dissolved_microcystin'
- 'extracted_phycocyanin'
- 'extracted_chla'
- 'total_p'
- 'total_dissolved_p'
- 'soluble_reactive_p'
- 'ammonia'
- 'nitrate_nitrite'
- 'particulate_organic_c'
- 'particulate_organic_n'
- 'cdom'
- 'tss'
- 'vss'
- 'Sample.Temperature...C.'
- 'Urea..µg.N.L.'
- 'Dissolved.Organic.Carbon..mg.L.'
- 'Date'
- 'Site'
- 'Station.Depth..m.'
- 'Sample.Depth..m.'
- 'Sample.Depth..category.'
- 'Local.Time..Eastern.Time.Zone.'
- 'Latitude..decimal.deg.'
- 'Longitude..decimal.deg.'
- 'Wind.speed..knots.'
- 'Wave.Height..ft.'
- 'Sky'
- 'Secchi.Depth..m.'
- 'Sample.Temperature...C.'
- 'CTD.Temperature...C.'
- 'CTD.Specific.Conductivity..µS.cm.'
- 'CTD.Beam.Attenuation..m.1.'
- 'CTD.Tramission....'
- 'CTD.Dissolved.Oxygen..mg.L.'
- 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
- 'Turbidity..NTU.'
- 'Particulate.Microcystin..µg.L.'
- 'Dissolved.Microcystin..µg.L.'
- 'Extracted.Phycocyanin..µg.L.'
- 'Extracted.Chlorophyll.a..µg.L.'
- 'Total.Phosphorus..µg.P.L.'
- 'Total.Dissolved.Phosphorus..µg.P.L.'
- 'Soluble.Reactive.Phosphorus..µg.P.L.'
- 'Ammonia..µg.N.L.'
- 'Nitrate...Nitrite..mg.N.L.'
- 'Urea..µg.N.L.'
- 'Particulate.Organic.Carbon..mg.L.'
- 'Particulate.Organic.Nitrogen..mg.L.'
- 'Dissolved.Organic.Carbon..mg.L.'
- 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
- 'Total.Suspended.Solids..mg.L.'
- 'Volatile.Suspended.Solids..mg.L.'
- 'Date'
- 'Site'
- 'Station.Depth..m.'
- 'Sample.Depth..m.'
- 'Sample.Depth..category.'
- 'Local.Time..Eastern.Time.Zone.'
- 'Latitude..decimal.deg.'
- 'Longitude..decimal.deg.'
- 'Wind.speed..knots.'
- 'Wave.Height..ft.'
- 'Sky'
- 'Secchi.Depth..m.'
- 'CTD.Temperature...C.'
- 'CTD.Specific.Conductivity..µS.cm.'
- 'CTD.Beam.Attenuation..m.1.'
- 'CTD.Tramission....'
- 'CTD.Dissolved.Oxygen..mg.L.'
- 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
- 'Turbidity..NTU.'
- 'Particulate.Microcystin..µg.L.'
- 'Dissolved.Microcystin..µg.L.'
- 'Extracted.Phycocyanin..µg.L.'
- 'Extracted.Chlorophyll.a..µg.L.'
- 'Total.Phosphorus..µg.P.L.'
- 'Total.Dissolved.Phosphorus..µg.P.L.'
- 'Soluble.Reactive.Phosphorus..µg.P.L.'
- 'Ammonia..µg.N.L.'
- 'Nitrate...Nitrite..mg.N.L.'
- 'Particulate.Organic.Carbon..mg.L.'
- 'Particulate.Organic.Nitrogen..mg.L.'
- 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
- 'Total.Suspended.Solids..mg.L.'
- 'Volatile.Suspended.Solids..mg.L.'
- 'date'
- 'station_name'
- 'station_depth_m'
- 'sample_depth_m'
- 'sample_depth_category'
- 'time'
- 'lat'
- 'lon'
- 'wind_speed'
- 'wave_height'
- 'sky'
- 'secchi_depth'
- 'ctd_temp'
- 'ctd_specific_conductivity'
- 'ctd_beam_attenuation'
- 'ctd_transmission'
- 'ctd_dissolved_oxygen'
- 'ctd_par'
- 'turbidity'
- 'particulate_microcystin'
- 'dissolved_microcystin'
- 'extracted_phycocyanin'
- 'extracted_chla'
- 'total_p'
- 'total_dissolved_p'
- 'soluble_reactive_p'
- 'ammonia'
- 'nitrate_nitrite'
- 'particulate_organic_c'
- 'particulate_organic_n'
- 'cdom'
- 'tss'
- 'vss'
- 'date'
- 'station_name'
- 'station_depth_m'
- 'sample_depth_m'
- 'sample_depth_category'
- 'time'
- 'lat'
- 'lon'
- 'wind_speed'
- 'wave_height'
- 'sky'
- 'secchi_depth'
- 'ctd_temp'
- 'ctd_specific_conductivity'
- 'ctd_beam_attenuation'
- 'ctd_transmission'
- 'ctd_dissolved_oxygen'
- 'ctd_par'
- 'turbidity'
- 'particulate_microcystin'
- 'dissolved_microcystin'
- 'extracted_phycocyanin'
- 'extracted_chla'
- 'total_p'
- 'total_dissolved_p'
- 'soluble_reactive_p'
- 'ammonia'
- 'nitrate_nitrite'
- 'particulate_organic_c'
- 'particulate_organic_n'
- 'cdom'
- 'tss'
- 'vss'
Save Dataframe as CSV for Use Downstream#
write.csv(df3,
file = 'data/clean_df.csv',
row.names = FALSE)