#' Summarise downloaded MODIS subsets
#'
#' Reads every MODIS ASCII subset file (written by MODISSubsets) found in Dir,
#' optionally quality-screens pixel values, computes per-pixel time-series
#' summaries (mean, sd, min, max, optionally annual yield), and writes two csv
#' files to Dir: "MODIS_Summary_..." (long-format summary statistics) and
#' "MODIS_Data_..." (the original dataset with per-pixel mean band values
#' appended, matched back via a SubsetID string).
#'
#' @param LoadDat Data frame in R, or file path string, of the original request
#'   data. Must contain columns named 'lat', 'long' and 'end.date' (and
#'   'start.date' when StartDate = TRUE).
#' @param FileSep Field delimiter; required when LoadDat is a file path.
#' @param Dir Directory holding the downloaded .asc files; output goes here too.
#' @param Product MODIS product code (e.g. "MOD13Q1").
#' @param Bands Character vector of data band names to summarise.
#' @param ValidRange Numeric length-2 vector: lower and upper valid data bounds.
#' @param NoDataFill Single integer fill value marking missing data.
#' @param ScaleFactor Single numeric multiplier applied to raw band values.
#' @param StartDate Logical; does LoadDat include a start.date column?
#' @param QualityScreen Logical; screen pixels using QA data via QualityCheck?
#' @param QualityBand,QualityThreshold QA band name and threshold; both required
#'   when QualityScreen = TRUE.
#' @param Mean,SD,Min,Max,Yield Logical switches for each summary statistic.
#' @param Interpolate Logical; linearly interpolate the (screened) series before
#'   summarising. NOTE(review): Yield = TRUE requires Interpolate = TRUE, since
#'   the yield calculation reads the interpolation result.
#' @param InterpolateN Number of interpolation points; default is roughly
#'   0.9 * length of the series.
#' @param DiagnosticPlot Logical; write per-site pdf plots (only possible for
#'   single-pixel subsets).
#' @return NULL (value of the final cat). Called for its side effect of writing
#'   the two csv output files (and optional pdf plots) into Dir.
MODISSummaries <- function(LoadDat, FileSep = NULL, Dir = ".", Product, Bands, ValidRange, NoDataFill,
                           ScaleFactor, StartDate = FALSE, QualityScreen = FALSE, QualityBand = NULL,
                           QualityThreshold = NULL, Mean = TRUE, SD = TRUE, Min = TRUE, Max = TRUE,
                           Yield = FALSE, Interpolate = FALSE, InterpolateN = NULL, DiagnosticPlot = FALSE)
{
  # Number of metadata columns preceding the pixel-data columns in each .asc file.
  NUM_METADATA_COLS <- 10

  if(Dir == '.') cat('MODIS data files from ', getwd(), ' will be summarised.\nSummary files will be written to the same directory.\n', sep = '')
  if(Dir != '.') cat('MODIS data files from ', Dir, ' will be summarised.\nSummary files will be written to the same directory.\n', sep = '')

  # Load input time-series data file; external data file, or an R object.
  if(!is.object(LoadDat) & !is.character(LoadDat)) stop("LoadDat must be an object in R or a file path character string.")
  if(is.object(LoadDat)) details <- data.frame(LoadDat)
  if(is.character(LoadDat)){
    if(!file.exists(LoadDat)) stop("Character string input for LoadDat does not resemble an existing file path.")
    if(is.null(FileSep)) stop("To load a file as input, you must also specify its delimiter (FileSep).")
    details <- read.delim(LoadDat, sep = FileSep)
  }

  #####
  # Check lat and long data frame columns are named "lat" and "long" as necessary.
  if(!any(names(details) == "lat") | !any(names(details) == "long")){
    stop("Could not find columns for latitude and/or longitude in your data set. Must be named 'lat' and 'long'.")
  }

  if(!file.exists(Dir)) stop("Character string input for Dir argument does not resemble an existing file path.")

  # Check valid inputs for the quality screening process.
  if(!is.logical(QualityScreen)) stop("QualityScreen argument should be a logical input. See help ?MODISSummaries.")
  if(QualityScreen){
    if(is.null(QualityBand) | is.null(QualityThreshold)) stop("QualityBand and QualityThreshold not specified.")
  }

  # Validate requested bands against the product's band list; skip the check if
  # the MODIS web service cannot be reached right now.
  product.bands <- try(GetBands(Product), silent = TRUE)
  if(!inherits(product.bands, "try-error")){
    # Check Band and QualityBand belong to Product.
    if(!all(Bands %in% product.bands)) stop(paste("Band input does not match with", Product, "product.", sep = " "))
    if(QualityScreen){
      if(Product == "MCD43A4"){
        # MCD43A4 stores its QA in a separate band, so it is special-cased here.
        if(QualityBand != "BRDF_Albedo_Band_Quality") stop("QualityBand input is not QA data for MCD43A4 product.")
      } else {
        if(!any(product.bands == QualityBand)) stop(paste("QualityBand is not QA data for", Product, "product.", sep = " "))
      }
    }
  } else {
    cat("MODIS server temporarily overloaded so user input checking skipped.")
  }

  # NoDataFill should be one integer.
  if(length(NoDataFill) != 1) stop("NoDataFill input must be one integer.")
  if(!is.numeric(NoDataFill)) stop("NoDataFill should be numeric class. One integer.")
  if(abs(NoDataFill - round(NoDataFill)) > .Machine$double.eps^0.5) stop("NoDataFill input must be one integer.")

  # ValidRange should be a numeric vector, length 2.
  if(length(ValidRange) != 2) stop("ValidRange input must be a numeric vector - an upper and lower bound.")
  if(!is.numeric(ValidRange)) stop("ValidRange should be numeric class.")

  # ScaleFactor should be numeric, length 1.
  if(length(ScaleFactor) != 1) stop("ScaleFactor input must be one numeric element.")
  if(!is.numeric(ScaleFactor)) stop("ScaleFactor should be numeric class.")

  # Year or posixt date format?
  if(all(is.na(details$end.date))) stop("Time-series end.date variable in LoadDat is empty.")

  Year <- FALSE
  POSIXt <- FALSE
  posix.compatible <- try(as.POSIXlt(details$end.date), silent = TRUE)
  if(any(class(details$end.date) == "POSIXt") | !inherits(posix.compatible, "try-error")) POSIXt <- TRUE
  if(all(is.numeric(details$end.date) & nchar(as.character(details$end.date)) == 4) & inherits(posix.compatible, "try-error")) Year <- TRUE
  # BUG FIX: message previously read "Date informDate information ...".
  if(!Year & !POSIXt) stop("Date information in LoadDat is not recognised as years or as POSIXt format.")
  if(Year & POSIXt) stop("Date information in LoadDat is recognised as both year and POSIXt formats.")

  if(StartDate){
    if(!any(names(details) == "start.date") | all(is.na(details$start.date))){
      stop("StartDate == TRUE, but no start.date field found in LoadDat.")
    }
  }
  #####

  # Date and time stamp, used to make the output file names unique per run.
  date <- as.POSIXlt(Sys.time())
  file.date <- paste(as.Date(date), paste(paste0("h", date$hour), paste0("m", date$min),
                                          paste0("s", round(date$sec, digits = 0)), sep = "-"), sep = "_")

  # Get a list of all downloaded subset (.asc) files in the data directory.
  file.list <- list.files(path = Dir, pattern = paste(Product, ".*asc$", sep = ""))
  if(length(file.list) == 0) stop("Found no MODIS data files in Dir that match the request.")

  # Pixels per subset, per file; all files must agree to be summarised together.
  size.test <- sapply(file.path(Dir, file.list), function(x) ncol(read.csv(x)[1, ]) - NUM_METADATA_COLS)
  if(!all(size.test == size.test[1])) stop("The number of pixels (Size) in subsets identified are not all the same.")

  # size.test checked all subsets are compatible for processing to one summary file. Can now just use size.test[1]
  num.pixels <- unname(size.test[1])

  # Extract IDs for ASCII files so they can be included in summary output; ncol = length(file.list), nrow = size.test.
  where.id <- regexpr("___", file.list)
  id <- matrix(unname(mapply(function(x, y, z) rep(substr(x, 1, y - 1), z),
                             x = file.list, y = where.id, z = size.test, SIMPLIFY = "array")),
               nrow = size.test, ncol = length(file.list))

  # One result matrix per file: (num.pixels * number of bands) rows x 13 summary columns.
  band.data.site <- lapply((size.test * length(Bands)), function(x) matrix(nrow = x, ncol = 13))

  # Time-series analysis for each time-series (ASCII file) consecutively.
  for(counter in seq_along(file.list)){
    cat("Processing file ", counter, " of ", length(file.list), "...\n", sep = "")

    ##### Load selected .asc file into a data frame, name columns and tell user what's being processed.
    ds <- read.csv(file.path(Dir, file.list[counter]), header = FALSE, as.is = TRUE)
    names(ds) <- c("nrow", "ncol", "xll", "yll", "pixelsize", "row.id", "product.code", "MODIS.acq.date",
                   "where", "MODIS.proc.date", 1:(ncol(ds) - NUM_METADATA_COLS))

    # Extract year and day from the metadata and make POSIXlt dates (YYYY-MM-DD), ready for time-series analysis.
    year <- as.numeric(substr(ds$MODIS.acq.date, 2, 5))
    day <- as.numeric(substr(ds$MODIS.acq.date, 6, 8))
    date <- strptime(paste(year, "-", day, sep = ""), "%Y-%j")
    ds <- cbind(ds[ ,1:NUM_METADATA_COLS], date, ds[ ,(NUM_METADATA_COLS+1):ncol(ds)])
    # Index of the first pixel-data column (the one just after "date").
    w.ds.dat <- which(names(ds) == "date") + 1
    #####

    ##### Section that uses the files metadata strings [ ,1:5] for each time-series to extract necessary information.
    wherelong <- regexpr("Lon", ds$where[1])
    whereSamp <- regexpr("Samp", ds$where[1])
    lat <- as.numeric(substr(ds$where[1], 4, wherelong - 1))
    long <- as.numeric(substr(ds$where[1], wherelong + 3, whereSamp - 1))

    # The band name is the final dot-separated token of each row id.
    rowIdBands <- Reduce(rbind, strsplit(ds$row.id, "[.]"))
    rowIdBands <- rowIdBands[ ,ncol(rowIdBands)]

    # Check that all bands listed are in the ASCII files.
    if(!all(Bands %in% rowIdBands)) stop("Not all Bands are represented in data file.")

    # Identify which rows in ds correspond to the quality data and put into a matrix.
    if(QualityScreen){
      if(QualityBand %in% rowIdBands){
        which.QA <- which(QualityBand == rowIdBands)
      } else {
        stop("Cannot find quality data in LoadDat. Download quality data with band data in MODISSubsets.")
      }
      QA.time.series <- as.matrix(ds[which.QA, w.ds.dat:ncol(ds)],
                                  nrow = length(which.QA), ncol = length(w.ds.dat:ncol(ds)))
    }
    #####

    for(bands in seq_along(Bands)){
      # Find rows in ds that correspond to Bands[bands] data and put into a matrix.
      if(Bands[bands] %in% rowIdBands){
        which.band <- which(Bands[bands] == rowIdBands)
      } else {
        stop("Cannot find band data in LoadDat. Make sure ASCII files in the directory are from MODISSubsets.")
      }
      band.time.series <- as.matrix(ds[which.band, w.ds.dat:ncol(ds)],
                                    nrow = length(which.band), ncol = length(w.ds.dat:ncol(ds)))

      ##### Screen pixels in band.time.series: any NoDataFill or pixels < QualityThreshold, will be replaced with NA.
      if(QualityScreen){
        band.time.series <- QualityCheck(Data = band.time.series, QualityScores = QA.time.series,
                                         Band = Bands[bands], NoDataFill = NoDataFill, QualityBand = QualityBand,
                                         Product = Product, QualityThreshold = QualityThreshold)
      } else {
        band.time.series <- matrix(ifelse(band.time.series != NoDataFill, band.time.series, NA),
                                   nrow = length(which.band))
      }

      # Final check, that band values all fall within the ValidRange (as defined for given MODIS product band).
      # BUG FIX: previously used scalar '&&' on whole matrices, which only
      # inspected the first element (and errors in R >= 4.3); '&' is elementwise.
      if(any(!(band.time.series >= ValidRange[1] & band.time.series <= ValidRange[2]), na.rm = TRUE)){
        stop("Some values fall outside the valid range, after no fill values should have been removed.")
      }
      ####

      # Initialise objects for various summaries, one slot per pixel.
      mean.band <- rep(NA, num.pixels)
      sd.band <- rep(NA, num.pixels)
      band.yield <- rep(NA, num.pixels)
      nofill <- rep(NA, num.pixels)
      poorquality <- rep(NA, num.pixels)
      band.min <- rep(NA, num.pixels)
      band.max <- rep(NA, num.pixels)

      if(DiagnosticPlot & num.pixels != 1){
        cat("Can not provide diagnostic plots for subsets larger than a single pixel")
        DiagnosticPlot <- FALSE
      }

      # Run time-series analysis for the ith pixel.
      for(i in seq_len(ncol(band.time.series))){
        # Minimum and maximum band values observed.
        minobsband <- min(as.numeric(band.time.series[ ,i]) * ScaleFactor, na.rm = TRUE)
        maxobsband <- max(as.numeric(band.time.series[ ,i]) * ScaleFactor, na.rm = TRUE)

        # Assess quality of data at this time-step, to decide how to proceed with
        # analysis; without screening, assume there are enough (>= 2) data points.
        if(QualityScreen){
          data.quality <- sum(!is.na(band.time.series[ ,i]))
        } else {
          data.quality <- 2
        }

        if(data.quality >= 2){
          # Linearly interpolate between screened data points, for each pixel, over time (default is daily).
          if(Interpolate){
            if(is.null(InterpolateN)){
              # Default resolution: ~90% of the series length. Note this sticks
              # for all subsequent pixels/files once set.
              InterpolateN <- round(0.9 * as.numeric(length(band.time.series)))
            }
            sout <- approx(x = 1:nrow(band.time.series), y = as.numeric(band.time.series[ ,i]) * ScaleFactor,
                           method = "linear", n = InterpolateN)
          }

          # Carry out all the relevant summary analyses, set by options in the function call.
          # NOTE(review): the yield calculation reads sout, so Yield = TRUE
          # requires Interpolate = TRUE; otherwise this line errors.
          if(Yield) band.yield[i] <- (sum(sout$y) - minobsband * length(sout$x)) / length(sout$x)
          if(Mean){
            if(Interpolate){
              mean.band[i] <- mean(sout$y)
            } else {
              mean.band[i] <- mean(as.numeric(band.time.series[ ,i]) * ScaleFactor, na.rm = TRUE)
            }
          }
          if(SD){
            if(Interpolate){
              sd.band[i] <- sd(sout$y)
            } else {
              sd.band[i] <- sd(as.numeric(band.time.series[ ,i]) * ScaleFactor, na.rm = TRUE)
            }
          }
        }
        if(data.quality == 1){
          warning("Only single data point that passed the quality screen: cannot summarise", immediate. = TRUE)
          if(Mean) mean.band[i] <- mean(as.numeric(band.time.series[ ,i]) * ScaleFactor, na.rm = TRUE)
        }
        if(data.quality == 0) warning("No reliable data for this pixel", immediate. = TRUE)

        # Complete final optional summaries, irrespective of data quality.
        if(Min) band.min[i] <- minobsband
        if(Max) band.max[i] <- maxobsband

        # Report fill-value and poor-quality counts as "xx.xx% (n/total)" strings.
        # NOTE(review): the fill count scans the whole ds column (all band and QA
        # rows), while the denominator is this band's series length — confirm intended.
        nofill[i] <- paste(round((sum(ds[ ,i + (w.ds.dat - 1)] == NoDataFill) / length(band.time.series[ ,i])) * 100, 2),
                           "% (", sum(ds[ ,i + (w.ds.dat - 1)] == NoDataFill), "/", length(band.time.series[ ,i]), ")", sep = "")
        if(QualityScreen){
          poorquality[i] <- paste(round((sum(QA.time.series[ ,i] > QualityThreshold) / length(QA.time.series[ ,i])) * 100, 2),
                                  "% (", sum(QA.time.series[ ,i] > QualityThreshold), "/", length(QA.time.series[ ,i]), ")", sep = "")
        } else {
          poorquality[i] <- NA
        }
      } # End of loop for time-series summary analysis for each pixel.

      # Compile time-series information and relevant summaries data into a final output listed by-sites (.asc files).
      band.data.site[[counter]][(((bands - 1) * num.pixels) + 1):(num.pixels * bands), ] <-
        matrix(data = c(ID = id[ ,counter], lat = rep(lat, num.pixels), long = rep(long, num.pixels),
                        start.date = rep(as.character(min(ds$date)), num.pixels),
                        end.date = rep(as.character(max(ds$date)), num.pixels),
                        data.band = rep(Bands[bands], num.pixels), min.band = band.min, max.band = band.max,
                        mean.band = mean.band, sd.band = sd.band, band.yield = band.yield,
                        no.fill.data = nofill, poor.quality.data = poorquality),
               nrow = num.pixels, ncol = 13)

      if(DiagnosticPlot){
        directory <- file.path(Dir, "DiagnosticPlots")
        if(file.exists(directory) == FALSE) dir.create(directory)
        filename <- file.path(directory, paste(id[counter], Product, Bands[bands], file.date, ".pdf", sep = "_"))
        # NOTE(review): i, data.quality and sout below hold their values from the
        # last pass of the pixel loop; valid only because DiagnosticPlot is
        # restricted above to single-pixel subsets (so i == 1).
        if(data.quality == 1 | data.quality == 0){
          cat("Too few data points for diagnostic plot for this site\n")
        } else {
          pdf(filename)
          plot(band.time.series[ ,i] * ScaleFactor, pch = 19, main = id[counter], xlab = "Timestep",
               ylab = Bands[bands], ylim = c(-0.1, 1))
          if(Interpolate) lines(sout, col = "red")
          if(Mean) abline(a = mean.band[i], b = 0, lty = 2, col = "red")
          if(Min) abline(a = band.min[i], b = 0, lty = 2)
          if(Max) abline(a = band.max[i], b = 0, lty = 2)
          dev.off()
        }
      }
    } # End of loop for Bands.
  } # End of loop for each ASCII file.

  # Stack per-file results into one long data frame of summary statistics.
  band.data.site <- as.data.frame(do.call("rbind", band.data.site), stringsAsFactors = FALSE)
  names(band.data.site) <- c("ID", "lat", "long", "start.date", "end.date", "data.band", "min.band", "max.band",
                             "mean.band", "sd.band", "band.yield", "no.fill.data", "poor.quality.data")

  ### Create SubsetID variable as created in MODISSubsets for correct matching.
  if(StartDate){
    lat.long <- details[!is.na(details$lat) | !is.na(details$long) | !is.na(details$end.date) | !is.na(details$start.date), ]
    lat.long <- lat.long[!duplicated(data.frame(lat.long$lat, lat.long$long, lat.long$end.date, lat.long$start.date)), ]
  } else if(!StartDate){
    lat.long <- details[!is.na(details$lat) | !is.na(details$long) | !is.na(details$end.date), ]
    lat.long <- lat.long[!duplicated(data.frame(lat.long$lat, lat.long$long, lat.long$end.date)), ]
  }

  ID.check <- any(names(details) == "ID")
  n.unique <- FALSE
  if(ID.check){
    # Check if ID variable can identify unique time series
    n.unique <- length(unique(lat.long$ID)) == nrow(lat.long)
    if(n.unique) names(lat.long)[names(lat.long) == "ID"] <- "SubsetID"
  }
  # BUG FIX: this was "else if", so when an ID column existed but was not unique
  # neither branch created SubsetID and the matching loop below failed. Now the
  # SubsetID string is rebuilt whenever ID is absent OR non-unique.
  if(!ID.check | !n.unique){
    # Create end.date and start.date
    Year <- FALSE
    POSIXt <- FALSE
    posix.compatible <- try(as.POSIXlt(lat.long$end.date), silent = TRUE)
    if(any(class(lat.long$end.date) == "POSIXt") | !inherits(posix.compatible, "try-error")) POSIXt <- TRUE
    if(all(is.numeric(lat.long$end.date) & nchar(as.character(lat.long$end.date)) == 4) & inherits(posix.compatible, "try-error")) Year <- TRUE
    if(!Year & !POSIXt) stop("Date information in LoadDat is not recognised as years or as POSIXt format.")
    if(Year & POSIXt) stop("Date information in LoadDat is recognised as both year and POSIXt formats.")

    if(Year){
      # Year inputs span the whole calendar year: Jan 1 to Dec 31.
      end.date <- strptime(paste(lat.long$end.date, "-12-31", sep = ""), "%Y-%m-%d")
      if(StartDate){
        start.year.fail <- any(!is.numeric(lat.long$start.date) | nchar(lat.long$start.date) != 4)
        if(start.year.fail) stop("end.date identified as year dates, but start.date does not match.")
        start.date <- strptime(paste(lat.long$start.date, "-01-01", sep = ""), "%Y-%m-%d")
      }
    } else if(POSIXt){
      end.date <- strptime(lat.long$end.date, "%Y-%m-%d")
      if(StartDate){
        start.posix.fail <- any(class(try(as.POSIXlt(lat.long$end.date), silent = TRUE)) == "try-error")
        if(start.posix.fail) stop("end.date identified as POSIXt dates, but start.date does not match.")
        start.date <- strptime(lat.long$start.date, "%Y-%m-%d")
      }
    }

    if(!StartDate){
      # Recover the start date for each site from the downloaded file IDs, by
      # matching on coordinates and end date.
      date.ids <- mapply(function(x, y, z) unique(band.data.site$ID[band.data.site$lat == x & band.data.site$long == y & grepl(z, band.data.site$ID)]),
                         x = lat.long$lat, y = lat.long$long, z = as.character(end.date))
      if(!all(sapply(date.ids, length) == 1)){
        stop("Couldn't match summarised data back with original dataset.\n",
             "Make sure LoadDat is exact same dataset used to download MODIS data being summarised.")
      }
      start.date <- substr(date.ids, regexpr("Start", date.ids) + 5, regexpr("End", date.ids) - 1)
    }

    lat.long <- data.frame(SubsetID = paste("Lat", sprintf("%.5f", lat.long$lat), "Lon", sprintf("%.5f", lat.long$long),
                                            "Start", start.date, "End", end.date, sep = ""),
                           lat.long, stringsAsFactors = FALSE)
  }
  ###

  # Append per-pixel mean band values to the original dataset, matched by SubsetID.
  res <- data.frame(cbind(details, matrix(NA, nrow = nrow(details), ncol = 1 + (num.pixels * length(Bands)))))
  names(res) <- c(names(details), "SubsetID",
                  paste(rep(Bands, each = num.pixels), "_pixel", rep(1:num.pixels, times = length(Bands)), sep = ""))

  for(i in seq_along(lat.long$SubsetID)){
    # Coordinates are matched at 5-decimal-place precision, as used in SubsetID.
    res$SubsetID[sprintf("%.5f", res$lat) %in% sprintf("%.5f", as.numeric(lat.long$lat[i])) &
                 sprintf("%.5f", res$long) %in% sprintf("%.5f", as.numeric(lat.long$long[i])) &
                 res$end.date %in% lat.long$end.date[i]] <- lat.long$SubsetID[i]
    res[res$SubsetID %in% lat.long$SubsetID[i], (which(names(res) == "SubsetID") + 1):ncol(res)] <-
      matrix(band.data.site$mean.band[band.data.site$ID == lat.long$SubsetID[i]],
             nrow = nrow(as.matrix(res[res$SubsetID %in% lat.long$SubsetID[i], (which(names(res) == "SubsetID") + 1):ncol(res)])),
             ncol = ncol(as.matrix(res[res$SubsetID %in% lat.long$SubsetID[i], (which(names(res) == "SubsetID") + 1):ncol(res)])),
             byrow = TRUE)
  }

  #####
  # Write output summary file by appending summary data from all files, producing one file of summary stats output.
  write.table(band.data.site, file = file.path(Dir, paste("MODIS_Summary_", Product, "_", file.date, ".csv", sep = "")),
              sep = ",", row.names = FALSE)

  # Write the final appended dataset to a csv file, ready for use, in Dir.
  write.table(res, file = file.path(Dir, paste("MODIS_Data_", Product, "_", file.date, ".csv", sep = "")),
              sep = ",", col.names = TRUE, row.names = FALSE)
  #####

  cat("Done! Check the 'MODIS Summary' and 'MODIS Data' output files.\n")
}