This script automates the data query process for Utah Lake data and generates an output that interfaces readily with the WASP WRDB.
Two dataframe outputs are written to CSV in the working directory:
1. Raw, as-queried data from the EPA WQP
2. Processed data for the WRDB interface
The target output for the WRDB interface product is a dataframe with one unique result value for each site, date, parameter, and depth level, plus a single uniform timestamp and the required metadata. To generate this product, the script:
1. Queries data for Utah Lake sites from EPA WQP
2. Removes lab-based results for field parameters (pH, temperature, DO, specific conductance)
3. Harmonizes multiple depth attribute columns (relative depth, profile result depth, & grab sample activity depth) to a single attribute
4. Harmonizes the multiple timestamps available for a site & date to a single (earliest) timestamp applied to all records collected at that site & date
5. Extracts a single unique result value for each site, date, parameter, and depth level
Last run:
## [1] "2019-08-24"
#devtools::install_github("utah-dwq/wqTools") # Install DWQ wqTools R package if necessary (see github.com/utah-dwq/wqTools for more info)
library(wqTools)
auid_list=c(
"UT-L-16020201-004_02",
"UT-L-16020201-004_01")
ul_sites=wqTools::readWQP(type='sites', auid=auid_list, siteType=c("Lake, Reservoir, Impoundment"), sampleMedia="Water") # The WQP connection occasionally times out; the function retries up to 10 times
## [1] "https://www.waterqualitydata.us/data/Station/search?siteType=Lake%2C%20Reservoir%2C%20Impoundment&sampleMedia=Water&bBox=-111.9435321%2C40.0089953%2C-111.6670281%2C40.3640109&mimeType=csv&zip=no"
map=wqTools::buildMap(sites=ul_sites, plot_polys=F)
map=leaflet::setView(map, lng=median(ul_sites$LongitudeMeasure), lat=median(ul_sites$LatitudeMeasure), zoom=10)
map
NOTE:
WQP data are queried as four separate components: 1) site metadata, 2) result values (narrow result), 3) activity metadata (activity), and 4) detection/quantitation limits. Site metadata, activity metadata, and detection/quantitation limits are each merged into narrow result to generate a single dataset. Multiple det/quant limits are often available for a single water quality result, so these are cast to a wide format before merging into narrow result to retain all available det/quant limits.
ul_nr=wqTools::readWQP(type='narrowresult', auid=auid_list, siteType=c("Lake, Reservoir, Impoundment"), sampleMedia="Water", print=F)
## [1] "https://www.waterqualitydata.us/data/Result/search?siteType=Lake%2C%20Reservoir%2C%20Impoundment&sampleMedia=Water&bBox=-111.9435321%2C40.0089953%2C-111.6670281%2C40.3640109&mimeType=csv&zip=no&dataProfile=narrowResult"
ul_act=wqTools::readWQP(type='activity', auid=auid_list, sampleMedia="Water")
## [1] "https://www.waterqualitydata.us/data/Activity/search?sampleMedia=Water&bBox=-111.9435321%2C40.0089953%2C-111.6670281%2C40.3640109&mimeType=csv&zip=no&dataProfile=activityAll"
ul_dql=wqTools::readWQP(type='detquantlim', auid=auid_list, sampleMedia="Water")
## [1] "https://www.waterqualitydata.us/data/ResultDetectionQuantitationLimit/search?sampleMedia=Water&bBox=-111.9435321%2C40.0089953%2C-111.6670281%2C40.3640109&mimeType=csv&zip=no"
Detection/quantitation limit values and units are cast to wide format separately, then merged back together:
dql_vals=reshape2::dcast(OrganizationIdentifier+OrganizationFormalName+ActivityIdentifier+MonitoringLocationIdentifier+ResultIdentifier+CharacteristicName~DetectionQuantitationLimitTypeName, value.var='DetectionQuantitationLimitMeasure.MeasureValue', data=ul_dql) # one column per limit type, filled with limit values
names(dql_vals)=gsub(" ","", names(dql_vals)) # strip spaces from new column names
dql_units=reshape2::dcast(ResultIdentifier~DetectionQuantitationLimitTypeName, value.var='DetectionQuantitationLimitMeasure.MeasureUnitCode', data=ul_dql) # matching columns filled with limit units
names(dql_units)[2:dim(dql_units)[2]] = paste(names(dql_units)[2:dim(dql_units)[2]], "Unit") # append "Unit" to distinguish from value columns
names(dql_units)=gsub(" ","", names(dql_units))
ul_dql_wide=merge(dql_vals, dql_units) # merges on ResultIdentifier, the only shared column
if(dim(dql_units)[1]!=dim(dql_vals)[1] | dim(dql_units)[1]!=dim(ul_dql_wide)[1]){
stop("DQL dimensions do not match.")
}
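As an optional extra check, the wide table should hold exactly one row per ResultIdentifier before merging back to narrow result; a minimal sketch:
if(any(duplicated(ul_dql_wide$ResultIdentifier))){
	stop("Multiple rows per ResultIdentifier in wide DQL table.")
}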
ul_data=merge(ul_nr,ul_act, all.x=T) # join activity metadata to narrow result
ul_data=merge(ul_data,ul_sites, all.x=T) # join site metadata
ul_data=merge(ul_data,ul_dql_wide, all.x=T) # join wide-cast det/quant limits
dim(ul_data)[1] == dim(ul_nr)[1] # Row counts of merged data and narrow result should match
## [1] TRUE
write.csv(file=paste0("ul_data_wqp_raw_",Sys.Date(),".csv"), ul_data, row.names=F)
ul_data=ul_data[ul_data$MonitoringLocationIdentifier!="UTAHDWQ_WQX-4917320",]
field_params=c("pH","Specific conductance","Dissolved oxygen (DO)","Dissolved oxygen saturation","Temperature, water")
knitr::kable(table(ul_data[ul_data$CharacteristicName %in% field_params,"ActivityTypeCode"]))
ActivityTypeCode | Count |
---|---|
Field Msr/Obs | 4463 |
Field Msr/Obs-Portable Data Logger | 15860 |
Sample-Integrated Vertical Profile | 164 |
Sample-Routine | 412 |
ul_data=subset(ul_data, # drop lab/QC sample activity types for field parameters; keep field measurements
!(
(CharacteristicName %in% field_params) &
(ActivityTypeCode %in% c("Sample-Routine","Sample-Integrated Vertical Profile","Quality Control Sample-Field Replicate"))
)
)
knitr::kable(table(ul_data[ul_data$CharacteristicName %in% field_params,"ActivityTypeCode"]))
ActivityTypeCode | Count |
---|---|
Field Msr/Obs | 4463 |
Field Msr/Obs-Portable Data Logger | 15860 |
ul_data=within(ul_data,{
SampleDepthValue=ifelse(!is.na(ActivityDepthHeightMeasure.MeasureValue), ActivityDepthHeightMeasure.MeasureValue, ResultDepthHeightMeasure.MeasureValue) # prefer grab sample activity depth; fall back to profile result depth
SampleDepthUnit=ifelse(!is.na(ActivityDepthHeightMeasure.MeasureUnitCode), as.character(ActivityDepthHeightMeasure.MeasureUnitCode), as.character(ResultDepthHeightMeasure.MeasureUnitCode)) # carry the matching unit
})
NOTE:
Many records missing both depth fields have ActivityRelativeDepthName populated, which could potentially be used to fill in values. Others are records where CharacteristicName == "Depth, data-logger (ported)" and can be filled from the result value and unit.
knitr::kable(table(is.na(ul_data$SampleDepthValue) & is.na(ul_data$ActivityRelativeDepthName)))
Depth & relative depth both missing | Count |
---|---|
FALSE | 45403 |
TRUE | 5432 |
knitr::kable(table(is.na(ul_data$SampleDepthValue) & ul_data$CharacteristicName=="Depth, data-logger (ported)"))
Depth missing on logger-depth record | Count |
---|---|
FALSE | 50255 |
TRUE | 580 |
ul_data=within(ul_data,{
SampleDepthValue=ifelse(is.na(SampleDepthValue) & CharacteristicName == "Depth, data-logger (ported)", ResultMeasureValue, SampleDepthValue) # for logger depth records, the result value is the depth
SampleDepthUnit=ifelse(is.na(SampleDepthUnit) & CharacteristicName == "Depth, data-logger (ported)", as.character(ResultMeasure.MeasureUnitCode), as.character(SampleDepthUnit))
})
knitr::kable(table(is.na(ul_data$SampleDepthValue) & ul_data$CharacteristicName=="Depth, data-logger (ported)"))
Depth missing on logger-depth record | Count |
---|---|
FALSE | 50835 |
ul_data$SampleDepthValue[ul_data$ActivityRelativeDepthName=="Bottom" & ul_data$ActivityDepthHeightMeasure.MeasureValue==0] = NA # a depth of 0 on a "Bottom" sample is not meaningful; blank it so RelativeDepth applies
ul_data$RelativeDepth=ifelse(is.na(ul_data$SampleDepthValue) & !is.na(ul_data$ActivityRelativeDepthName), as.character(ul_data$ActivityRelativeDepthName), NA) # keep relative depth only where no numeric depth exists
times=unique(ul_data[,c("MonitoringLocationIdentifier", "ActivityStartDate", "ActivityStartTime.Time", "ActivityStartTime.TimeZoneCode")]) # unique site/date/time/zone combinations
times=times[!is.na(times$ActivityStartTime.Time),] # drop records lacking a start time
knitr::kable(table(times$ActivityStartTime.TimeZoneCode))
TimeZoneCode | Count |
---|---|
EST | 2 |
MDT | 17 |
MST | 1971 |
*Note: a few samples are marked with the MDT time zone; MST appears to be the default. Accepting all times and setting tz to "America/Denver".*
times$datetime=as.POSIXct(paste(times$ActivityStartDate, times$ActivityStartTime.Time), format="%Y-%m-%d %H:%M:%S", tz="America/Denver")
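This treats every clock time as local Mountain time. If the stated zone codes needed to be honored exactly, they could be mapped to fixed-offset Olson names and converted; a sketch (not applied here; tz_map is an assumed lookup):
tz_map=c(EST="Etc/GMT+5", MST="Etc/GMT+7", MDT="Etc/GMT+6") # Etc/GMT offsets are sign-inverted: Etc/GMT+7 is UTC-7 (MST)
dt_raw=as.POSIXct(paste(times$ActivityStartDate, times$ActivityStartTime.Time), format="%Y-%m-%d %H:%M:%S", tz="UTC") # parse clock times with a placeholder zone
datetime_strict=lubridate::force_tzs(dt_raw, tzones=tz_map[times$ActivityStartTime.TimeZoneCode], tzone_out="America/Denver") # reinterpret each time in its stated zone, then convert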
site_date_starttime=aggregate(datetime~MonitoringLocationIdentifier+ActivityStartDate, data=times, FUN='min') # earliest timestamp for each site & date
site_date_starttime=within(site_date_starttime, {
hour=lubridate::hour(datetime)
minute=lubridate::minute(datetime)
})
dim(site_date_starttime)
## [1] 1111 5
dim(unique(site_date_starttime[,c("MonitoringLocationIdentifier","ActivityStartDate")]))
## [1] 1111 2
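The same one-row-per-site-&-date check can be asserted programmatically; a minimal sketch:
stopifnot(!any(duplicated(site_date_starttime[,c("MonitoringLocationIdentifier","ActivityStartDate")])))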
ul_data=merge(ul_data, site_date_starttime, all.x=T) # apply the earliest start time to all records at that site & date
dim(ul_data)
## [1] 50835 197
cols_keep=c(
"OrganizationIdentifier","OrganizationFormalName","MonitoringLocationIdentifier","MonitoringLocationName","MonitoringLocationTypeName",
"LatitudeMeasure","LongitudeMeasure","HorizontalCoordinateReferenceSystemDatumName",
### Columns added by the processing steps above
"SampleDepthUnit","SampleDepthValue","RelativeDepth","datetime","minute","hour",
"CharacteristicName","ResultSampleFractionText","ResultMeasureValue","ResultMeasure.MeasureUnitCode",
"MethodSpecificationName","MeasureQualifierCode","ResultDetectionConditionText",
"ResultStatusIdentifier","ResultValueTypeName","ActivityMediaName",
"EstimatedQuantitationLimit","LowerQuantitationLimit","LowerReportingLimit","MethodDetectionLevel",
"UpperQuantitationLimit","EstimatedQuantitationLimitUnit",
"LowerQuantitationLimitUnit","LowerReportingLimitUnit","MethodDetectionLevelUnit","UpperQuantitationLimitUnit"
)
dim(ul_data)
## [1] 50835 197
ul_data=unique(ul_data[,cols_keep]) # subset to the kept columns and drop now-duplicate rows
dim(ul_data)
## [1] 47115 34
check_data=ul_data[,c("MonitoringLocationIdentifier","datetime","SampleDepthValue","RelativeDepth","CharacteristicName","ResultSampleFractionText")]
dim(unique(check_data))
## [1] 43851 6
dim(ul_data)[1] == dim(unique(check_data))[1]
## [1] FALSE
dim(ul_data)
## [1] 47115 34
dim(unique(ul_data))
## [1] 47115 34
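The FALSE above means some site, date, depth, parameter, and fraction keys carry more than one unique row; those keys can be pulled back out for inspection, e.g.:
dup_keys=unique(check_data[duplicated(check_data),]) # keys appearing more than once
head(merge(ul_data, dup_keys)) # example records with non-unique results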
NOTE:
Some records have more than one unique result value for the same site, date, parameter, and depth key. These are flagged in the processed data with a result_count column.
result_count_data=ul_data[,c("MonitoringLocationIdentifier","datetime","SampleDepthValue","RelativeDepth","CharacteristicName","ResultSampleFractionText","ResultMeasureValue")]
result_count_data=lapply(result_count_data, addNA) # factor each column with an explicit NA level so records with missing keys still group
result_count=aggregate(ResultMeasureValue~MonitoringLocationIdentifier+datetime+SampleDepthValue+RelativeDepth+CharacteristicName+ResultSampleFractionText, result_count_data, FUN='length') # count results per key
names(result_count)[names(result_count)=="ResultMeasureValue"]="result_count"
dim(ul_data)
## [1] 47115 34
ul_data=merge(ul_data, result_count, all.x=T)
dim(ul_data)
## [1] 47115 35
table(ul_data$result_count)
##
## 1 2 3 4 5 6 9 12 21
## 42244 1072 1953 1188 520 96 9 12 21
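Before loading to the WRDB, the result_count flag makes it easy to pull ambiguous records for review; for example:
multi_results=subset(ul_data, result_count>1) # records with multiple unique results for a single key
head(multi_results[order(multi_results$MonitoringLocationIdentifier, multi_results$datetime),])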
write.csv(file=paste0("ul_data_wqp_processed_",Sys.Date(),".csv"), ul_data, row.names=F)