Use R to retrieve data from NCEI#

#' R Code for Retrieving NCEI Data
#' Contains functions for retrieving WLE weekly monitoring data
#' from NCEI website and consolidating into one csv file for downstream
#' analysis.
#' 
#' C. Kitchens
#' University of Michigan
#' Cooperative Institute for Great Lakes Research
#' August 2022

# 0 - Install Packages ----------------------------------------------------

# install.packages("httr")
# install.packages("lubridate")
# install.packages("tidyverse")

Import packages#

library(httr)
library(lubridate)
library(tidyverse)
Attaching package: 'lubridate'
The following object is masked from 'package:base':

    date
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1       v purrr   0.3.2  
v tibble  2.1.1       v dplyr   0.8.0.1
v tidyr   0.8.3       v stringr 1.4.0  
v readr   1.3.1       v forcats 0.4.0  
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x lubridate::as.difftime() masks base::as.difftime()
x lubridate::date()        masks base::date()
x dplyr::filter()          masks stats::filter()
x lubridate::intersect()   masks base::intersect()
x dplyr::lag()             masks stats::lag()
x lubridate::setdiff()     masks base::setdiff()
x lubridate::union()       masks base::union()

Set working directory#

# Generate file path to working directory based on OS in use. OS.type will
# return "unix" if on Mac and "windows" if on PC.
if(.Platform$OS.type == "unix") {
  file_path <- file.path("/volumes",
                           "GoogleDrive",
                           "Shared drives",
                           "WE Stoichiometry paper",
                           "R",
                           fsep = "/")
} else {
  file_path <- file.path("D:",
                           "masonl",
                           "_py",
                           "ncei",
                           fsep = "/")
}

setwd(file_path)

Read in NCEI Data Using API Call#

#' Obtain urls linking to NCEI data for both 2012-2018 data files and 2019 data file
path_2018 = 'https://www.nodc.noaa.gov/archive/arc0135/0187718/2.2/data/0-data/lake_erie_habs_field_sampling_results_2012_2018_v2.csv'
path_2019 = 'https://www.nodc.noaa.gov/archive/arc0152/0209116/1.1/data/0-data/lake_erie_habs_field_sampling_results_2019.csv'
path_2021 = 'https://www.nodc.noaa.gov/archive/arc0204/0254720/1.1/data/0-data/noaa-glerl-erie-habs-field-sampling-results-2020-2021.csv'

#' Use GET to make REST API call to website
file_2018 <- GET(url = path_2018)
file_2019 <- GET(url = path_2019)
file_2021 <- GET(url = path_2021)

#' Turn raw content into csvs
file_2018 <- content(file_2018, as = 'raw')
writeBin(file_2018, 'data/NCEI_data_2018.csv')

file_2019 <- content(file_2019, as = 'raw')
writeBin(file_2019, 'data/NCEI_data_2019.csv')

file_2021 <- content(file_2021, as = 'raw')
writeBin(file_2021, 'data/NCEI_data_2021.csv')


df_2018 <- read.csv(
  file = 'data/NCEI_data_2018.csv',
  header = TRUE, dec = ',',
  strip.white = TRUE,
  na.strings = c(""),
  check.names = TRUE,
  fileEncoding = 'WINDOWS-1252')

df_2019 <- read.csv(
  file = 'data/NCEI_data_2019.csv',
  header = TRUE,
  dec = ',',
  strip.white = TRUE,
  na.strings = c(""),
  check.names = TRUE,
  fileEncoding = 'WINDOWS-1252')

df_2021 <- read.csv(
  file = 'data/NCEI_data_2021.csv',
  header = TRUE,
  dec = ',',
  strip.white = TRUE,
  na.strings = c(""),
  check.names = TRUE)

Use Set Operations to Identify Differences in Data Structure#

colnames(df_2018)
colnames(df_2019)
colnames(df_2021)

#' See what columns are absent across the various data frames
a <- names(df_2018)
b <- names(df_2019)
c <- names(df_2021)

setdiff(a, b)
setdiff(a, c)

setdiff(b, a)
setdiff(b, c)

setdiff(c, a)
setdiff(c, b)

#' Remove columns not shared in common and rename colnames

df_2018 <- df_2018 %>% 
  select(-Sample.Temperature...C.,-Urea..µg.N.L.,-Dissolved.Organic.Carbon..mg.L.) %>% 
  mutate(Date = mdy(Date))

df_2019 <- df_2019 %>% 
  mutate(Date = mdy(Date))

df_2021 <- df_2021 %>% 
  mutate(date = mdy(date))

colnames(df_2018) <- colnames(df_2021)
colnames(df_2019) <- colnames(df_2021)

#' Combine all data frames into a single data frame
df = rbind(df_2018, df_2019, df_2021)

#' Remove unnecessary columns, rename remaining, and create additional needed
#' columns. Note that as.numeric() function coerces any cell with a non-numeric
#' character (e.g. "-1.21") to an NA.
df2 <- df %>%
  select(
    -time,
    -lat,
    -lon,
    -wind_speed,
    -wave_height,
    -sky,
    -secchi_depth,
    -ctd_temp,
    -ctd_specific_conductivity,
    -ctd_beam_attenuation,
    -ctd_transmission,
    -ctd_dissolved_oxygen,
    -ctd_par,
    -particulate_microcystin,
    -dissolved_microcystin
    ) %>% 
  rename(
    Date = date,
    Site = station_name,
    StationDepth_m = station_depth_m,
    SampleDepth_m = sample_depth_m,
    SampleDepth_cat = sample_depth_category,
    Turbidity_NTU = turbidity,
    PC_ugL = extracted_phycocyanin,
    CHLA_ugL = extracted_chla,
    TP_ugL = total_p,
    TDP_ugL = total_dissolved_p,
    SRP_ugL = soluble_reactive_p,
    Ammonia_ugL = ammonia,
    NitrateAndNitrite_mgL = nitrate_nitrite,
    POC_mgL = particulate_organic_c,
    PON_mgL = particulate_organic_n,
    CDOM_400nmABS = cdom,
    TSS_mgL = tss,
    VSS_mgL = vss
    ) %>%
  mutate(
    Date = as.Date(Date, format = '%Y-%m-%d'),
    StationDepth_m = as.numeric(StationDepth_m),
    SampleDepth_m = as.numeric(SampleDepth_m),
    Turbidity_NTU = as.numeric(Turbidity_NTU),
    PC_ugL = as.numeric(PC_ugL),
    CHLA_ugL = as.numeric(CHLA_ugL),
    TP_ugL = as.numeric(TP_ugL),
    TDP_ugL = as.numeric(TDP_ugL),
    SRP_ugL = as.numeric(SRP_ugL),
    Ammonia_ugL = as.numeric(Ammonia_ugL),
    NitrateAndNitrite_mgL = as.numeric(NitrateAndNitrite_mgL),
    POC_mgL = as.numeric(POC_mgL),
    PON_mgL = as.numeric(PON_mgL),
    CDOM_400nmABS = as.numeric(CDOM_400nmABS),
    TSS_mgL = as.numeric(TSS_mgL),
    VSS_mgL = as.numeric(VSS_mgL)
    ) %>%
  #' For columns that have bdls entered as "<XXXX" or "bdl", R interprets
  #' as string instead of numeric, so remove "<" symbol from those entries,
  #' then convert entire columns to numeric objects.The result is a data
  #' frame where entries below limit of detection (LOD) are set equal to that LOD.
  #' For VSS, there is no recorded LOD, so "bdl" entries are set to NA.
  mutate(
    Date = ymd(Date),
    TDP_ugL = as.numeric(gsub("<", "", TDP_ugL)),
    SRP_ugL = as.numeric(gsub("<", "", SRP_ugL)),
    Ammonia_ugL = as.numeric(gsub("<", "", Ammonia_ugL)),
    NitrateAndNitrite_mgL = as.numeric(gsub("<", "", NitrateAndNitrite_mgL)),
    VSS_mgL = as.numeric(gsub("bdl", "", VSS_mgL))
    ) %>%
  mutate(
    JulianDate_DDD = yday(Date),
    PP_ugL = TP_ugL - TDP_ugL
    ) %>% 
  filter(
    SampleDepth_cat == 'Surface',
    Site %in% c('WE2', 'WE4', 'WE6', 'WE8', 'WE9', 'WE12', 'WE13', 'WE14', 'WE15', 'WE16'),
    ) %>%
  select(
    -SampleDepth_cat
    )

#' Add leading 0 to stations less than 10 (e.g. WE2 becomes WE02)
df3 <- df2 %>% 
  mutate(Site = gsub('(WE)(\\d{1})$', 'WE0\\2', Site))
  1. 'Date'
  2. 'Site'
  3. 'Station.Depth..m.'
  4. 'Sample.Depth..m.'
  5. 'Sample.Depth..category.'
  6. 'Local.Time..Eastern.Time.Zone.'
  7. 'Latitude..decimal.deg.'
  8. 'Longitude..decimal.deg.'
  9. 'Wind.speed..knots.'
  10. 'Wave.Height..ft.'
  11. 'Sky'
  12. 'Secchi.Depth..m.'
  13. 'Sample.Temperature...C.'
  14. 'CTD.Temperature...C.'
  15. 'CTD.Specific.Conductivity..µS.cm.'
  16. 'CTD.Beam.Attenuation..m.1.'
  17. 'CTD.Tramission....'
  18. 'CTD.Dissolved.Oxygen..mg.L.'
  19. 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
  20. 'Turbidity..NTU.'
  21. 'Particulate.Microcystin..µg.L.'
  22. 'Dissolved.Microcystin..µg.L.'
  23. 'Extracted.Phycocyanin..µg.L.'
  24. 'Extracted.Chlorophyll.a..µg.L.'
  25. 'Total.Phosphorus..µg.P.L.'
  26. 'Total.Dissolved.Phosphorus..µg.P.L.'
  27. 'Soluble.Reactive.Phosphorus..µg.P.L.'
  28. 'Ammonia..µg.N.L.'
  29. 'Nitrate...Nitrite..mg.N.L.'
  30. 'Urea..µg.N.L.'
  31. 'Particulate.Organic.Carbon..mg.L.'
  32. 'Particulate.Organic.Nitrogen..mg.L.'
  33. 'Dissolved.Organic.Carbon..mg.L.'
  34. 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
  35. 'Total.Suspended.Solids..mg.L.'
  36. 'Volatile.Suspended.Solids..mg.L.'
  1. 'Date'
  2. 'Site'
  3. 'Station.Depth..m.'
  4. 'Sample.Depth..m.'
  5. 'Sample.Depth..category.'
  6. 'Local.Time..Eastern.Time.Zone.'
  7. 'Latitude..decimal.deg.'
  8. 'Longitude..decimal.deg.'
  9. 'Wind.speed..knots.'
  10. 'Wave.Height..ft.'
  11. 'Sky'
  12. 'Secchi.Depth..m.'
  13. 'CTD.Temperature...C.'
  14. 'CTD.Specific.Conductivity..µS.cm.'
  15. 'CTD.Beam.Attenuation..m.1.'
  16. 'CTD.Tramission....'
  17. 'CTD.Dissolved.Oxygen..mg.L.'
  18. 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
  19. 'Turbidity..NTU.'
  20. 'Particulate.Microcystin..µg.L.'
  21. 'Dissolved.Microcystin..µg.L.'
  22. 'Extracted.Phycocyanin..µg.L.'
  23. 'Extracted.Chlorophyll.a..µg.L.'
  24. 'Total.Phosphorus..µg.P.L.'
  25. 'Total.Dissolved.Phosphorus..µg.P.L.'
  26. 'Soluble.Reactive.Phosphorus..µg.P.L.'
  27. 'Ammonia..µg.N.L.'
  28. 'Nitrate...Nitrite..mg.N.L.'
  29. 'Particulate.Organic.Carbon..mg.L.'
  30. 'Particulate.Organic.Nitrogen..mg.L.'
  31. 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
  32. 'Total.Suspended.Solids..mg.L.'
  33. 'Volatile.Suspended.Solids..mg.L.'
  1. 'date'
  2. 'station_name'
  3. 'station_depth_m'
  4. 'sample_depth_m'
  5. 'sample_depth_category'
  6. 'time'
  7. 'lat'
  8. 'lon'
  9. 'wind_speed'
  10. 'wave_height'
  11. 'sky'
  12. 'secchi_depth'
  13. 'ctd_temp'
  14. 'ctd_specific_conductivity'
  15. 'ctd_beam_attenuation'
  16. 'ctd_transmission'
  17. 'ctd_dissolved_oxygen'
  18. 'ctd_par'
  19. 'turbidity'
  20. 'particulate_microcystin'
  21. 'dissolved_microcystin'
  22. 'extracted_phycocyanin'
  23. 'extracted_chla'
  24. 'total_p'
  25. 'total_dissolved_p'
  26. 'soluble_reactive_p'
  27. 'ammonia'
  28. 'nitrate_nitrite'
  29. 'particulate_organic_c'
  30. 'particulate_organic_n'
  31. 'cdom'
  32. 'tss'
  33. 'vss'
  1. 'Sample.Temperature...C.'
  2. 'Urea..µg.N.L.'
  3. 'Dissolved.Organic.Carbon..mg.L.'
  1. 'Date'
  2. 'Site'
  3. 'Station.Depth..m.'
  4. 'Sample.Depth..m.'
  5. 'Sample.Depth..category.'
  6. 'Local.Time..Eastern.Time.Zone.'
  7. 'Latitude..decimal.deg.'
  8. 'Longitude..decimal.deg.'
  9. 'Wind.speed..knots.'
  10. 'Wave.Height..ft.'
  11. 'Sky'
  12. 'Secchi.Depth..m.'
  13. 'Sample.Temperature...C.'
  14. 'CTD.Temperature...C.'
  15. 'CTD.Specific.Conductivity..µS.cm.'
  16. 'CTD.Beam.Attenuation..m.1.'
  17. 'CTD.Tramission....'
  18. 'CTD.Dissolved.Oxygen..mg.L.'
  19. 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
  20. 'Turbidity..NTU.'
  21. 'Particulate.Microcystin..µg.L.'
  22. 'Dissolved.Microcystin..µg.L.'
  23. 'Extracted.Phycocyanin..µg.L.'
  24. 'Extracted.Chlorophyll.a..µg.L.'
  25. 'Total.Phosphorus..µg.P.L.'
  26. 'Total.Dissolved.Phosphorus..µg.P.L.'
  27. 'Soluble.Reactive.Phosphorus..µg.P.L.'
  28. 'Ammonia..µg.N.L.'
  29. 'Nitrate...Nitrite..mg.N.L.'
  30. 'Urea..µg.N.L.'
  31. 'Particulate.Organic.Carbon..mg.L.'
  32. 'Particulate.Organic.Nitrogen..mg.L.'
  33. 'Dissolved.Organic.Carbon..mg.L.'
  34. 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
  35. 'Total.Suspended.Solids..mg.L.'
  36. 'Volatile.Suspended.Solids..mg.L.'
  1. 'Date'
  2. 'Site'
  3. 'Station.Depth..m.'
  4. 'Sample.Depth..m.'
  5. 'Sample.Depth..category.'
  6. 'Local.Time..Eastern.Time.Zone.'
  7. 'Latitude..decimal.deg.'
  8. 'Longitude..decimal.deg.'
  9. 'Wind.speed..knots.'
  10. 'Wave.Height..ft.'
  11. 'Sky'
  12. 'Secchi.Depth..m.'
  13. 'CTD.Temperature...C.'
  14. 'CTD.Specific.Conductivity..µS.cm.'
  15. 'CTD.Beam.Attenuation..m.1.'
  16. 'CTD.Tramission....'
  17. 'CTD.Dissolved.Oxygen..mg.L.'
  18. 'CTD.Photosynthetically.Active.Radiation..µE.m2.s.'
  19. 'Turbidity..NTU.'
  20. 'Particulate.Microcystin..µg.L.'
  21. 'Dissolved.Microcystin..µg.L.'
  22. 'Extracted.Phycocyanin..µg.L.'
  23. 'Extracted.Chlorophyll.a..µg.L.'
  24. 'Total.Phosphorus..µg.P.L.'
  25. 'Total.Dissolved.Phosphorus..µg.P.L.'
  26. 'Soluble.Reactive.Phosphorus..µg.P.L.'
  27. 'Ammonia..µg.N.L.'
  28. 'Nitrate...Nitrite..mg.N.L.'
  29. 'Particulate.Organic.Carbon..mg.L.'
  30. 'Particulate.Organic.Nitrogen..mg.L.'
  31. 'Colored.Dissolved.Organic.Material.absorbance..m.1..at.400nm'
  32. 'Total.Suspended.Solids..mg.L.'
  33. 'Volatile.Suspended.Solids..mg.L.'
  1. 'date'
  2. 'station_name'
  3. 'station_depth_m'
  4. 'sample_depth_m'
  5. 'sample_depth_category'
  6. 'time'
  7. 'lat'
  8. 'lon'
  9. 'wind_speed'
  10. 'wave_height'
  11. 'sky'
  12. 'secchi_depth'
  13. 'ctd_temp'
  14. 'ctd_specific_conductivity'
  15. 'ctd_beam_attenuation'
  16. 'ctd_transmission'
  17. 'ctd_dissolved_oxygen'
  18. 'ctd_par'
  19. 'turbidity'
  20. 'particulate_microcystin'
  21. 'dissolved_microcystin'
  22. 'extracted_phycocyanin'
  23. 'extracted_chla'
  24. 'total_p'
  25. 'total_dissolved_p'
  26. 'soluble_reactive_p'
  27. 'ammonia'
  28. 'nitrate_nitrite'
  29. 'particulate_organic_c'
  30. 'particulate_organic_n'
  31. 'cdom'
  32. 'tss'
  33. 'vss'
  1. 'date'
  2. 'station_name'
  3. 'station_depth_m'
  4. 'sample_depth_m'
  5. 'sample_depth_category'
  6. 'time'
  7. 'lat'
  8. 'lon'
  9. 'wind_speed'
  10. 'wave_height'
  11. 'sky'
  12. 'secchi_depth'
  13. 'ctd_temp'
  14. 'ctd_specific_conductivity'
  15. 'ctd_beam_attenuation'
  16. 'ctd_transmission'
  17. 'ctd_dissolved_oxygen'
  18. 'ctd_par'
  19. 'turbidity'
  20. 'particulate_microcystin'
  21. 'dissolved_microcystin'
  22. 'extracted_phycocyanin'
  23. 'extracted_chla'
  24. 'total_p'
  25. 'total_dissolved_p'
  26. 'soluble_reactive_p'
  27. 'ammonia'
  28. 'nitrate_nitrite'
  29. 'particulate_organic_c'
  30. 'particulate_organic_n'
  31. 'cdom'
  32. 'tss'
  33. 'vss'

Save Dataframe as CSV for Use Downstream#

write.csv(df3,
          file = 'data/clean_df.csv',
          row.names = FALSE)