1. Repeat the exercise from the Batch Processing Lecture (5 April), but do it using real data sets rather than purely simulated ones. Check with folks in your lab to see if there are multiple data sets available for analysis, or ask Nick, Lauren, or Emily for suggestions for other data sources. Stick to simple data analyses and graphics, but try to set it up as a batch process that will work on multiple files and save summary results to a common file.

For this homework, I am working with 18 .csv files containing zooplankton length data. Each .csv file contains data from a different sample, and I am going to use batch processing to calculate the same summary statistics (mean and standard deviation of length) for each sample, and then save these summary statistics to an output summary file.

# Function to get summary statistics from each of the individual files (mean and sd of zooplankton lengths)

################################
# FUNCTION: length_stats
# purpose: extract mean and sd of zooplankton lengths
# input: data frame with columns "sampleID" and "length_mm"
#        (only "length_mm" is read); if NULL, a fake 20-row
#        data set is generated so the function can be tested alone
# output: named list with elements mean_length and sd_length
# ------------------------------
length_stats <- function(d = NULL) {

  if (is.null(d)) { # create fake data set for no inputs to the function
    sampleID <- rep(sample.int(40, 1), times = 20)
    length_mm <- runif(20, min = 0.1, max = 0.4)
    d <- data.frame(sampleID, length_mm)
  }

  # Use `=` (not `<-`) inside list() so the elements are actually named;
  # `<-` would only create stray local variables and leave the list unnamed.
  stats_list <- list(mean_length = mean(d$length_mm), # calculate mean length
                     sd_length = sd(d$length_mm))     # calculate sd length

  return(stats_list)
}

#length_stats() # test length_stats function
# Batch processing

set.seed(2000)

# Global variables
file_folder <- "FOAMZ_2019_zoop/" # folder where the .csv files are in my R project
n_files <- 18 # number of files
file_out <- "LengthSummary.csv" # output summary file

# Grab the file names. Only .csv files are kept, so that any other file
# stored in the folder is not handed to the batch loop and read as data.
file_names <- list.files(path = file_folder,
                         pattern = "\\.csv$",
                         ignore.case = TRUE)
head(file_names)
## [1] "M1_01AUG2019_N_EP_022.csv" "M1_01AUG2019_N_HP_023.csv"
## [3] "M1_01AUG2019_N_IL_025.csv" "M1_01AUG2019_N_OL_024.csv"
## [5] "M1_31JUL2019_D_EP_001.csv" "M1_31JUL2019_D_HP_002.csv"
# Pre-allocate the summary table: one row per input file, with the two
# statistics columns set to NA until the batch loop fills them in
ID <- seq_along(file_names)
file_name <- file_names
mean_length <- sd_length <- rep(NA, times = length(file_names))
stats_out <- data.frame(ID = ID,
                        file_name = file_name,
                        mean_length = mean_length,
                        sd_length = sd_length)
head(stats_out)
##   ID                 file_name mean_length sd_length
## 1  1 M1_01AUG2019_N_EP_022.csv          NA        NA
## 2  2 M1_01AUG2019_N_HP_023.csv          NA        NA
## 3  3 M1_01AUG2019_N_IL_025.csv          NA        NA
## 4  4 M1_01AUG2019_N_OL_024.csv          NA        NA
## 5  5 M1_31JUL2019_D_EP_001.csv          NA        NA
## 6  6 M1_31JUL2019_D_HP_002.csv          NA        NA
# Batch process: read each sample file, summarise it, store the result

for (i in seq_along(file_names)) {
  # file_folder already ends in "/", so plain concatenation gives the path
  sample_path <- paste0(file_folder, file_names[i])
  sample_data <- read.table(file = sample_path, header = TRUE, sep = ",")

  # length_stats() returns the mean first and the sd second;
  # unlist() flattens that list into a length-2 vector
  sample_stats <- unlist(length_stats(sample_data))

  # fill the two summary columns (columns 3 and 4 of stats_out) for row i
  stats_out[i, c("mean_length", "sd_length")] <- sample_stats
} # end of for loop

# Set up output file and incorporate a time stamp and minimal metadata.
# cat() writes the header lines itself (replacing any existing file_out),
# so it must not be wrapped in write.table(): cat() returns NULL and
# write.table() would only print an empty "" to the console.
# Pieces are joined with sep = "", hence the explicit trailing space
# after "for".
cat("# Summary stats for ",
    "batch processing of zooplankton length data", "\n",
    "# timestamp: ",
    as.character(Sys.time()), "\n",
    file = file_out, # what we are going to call this final file
    sep = "")
# Now, add the data frame below the header lines.
# The table is written through a connection opened in append mode rather
# than with append = TRUE, because write.table() warns ("appending column
# names to file") whenever append = TRUE is combined with column names.
out_con <- file(file_out, open = "a")
invisible(tryCatch(
  write.table(x = stats_out,
              file = out_con,
              row.names = FALSE,
              col.names = TRUE,
              sep = ","),
  finally = close(out_con) # always release the file, even on error
))

Taking a look at the output summary:

print(stats_out)
##    ID                 file_name mean_length  sd_length
## 1   1 M1_01AUG2019_N_EP_022.csv   0.3185155 0.13008865
## 2   2 M1_01AUG2019_N_HP_023.csv   0.3870420 0.21738754
## 3   3 M1_01AUG2019_N_IL_025.csv   0.3443937 0.24591226
## 4   4 M1_01AUG2019_N_OL_024.csv   0.3087089 0.14444251
## 5   5 M1_31JUL2019_D_EP_001.csv   0.3792819 0.18455917
## 6   6 M1_31JUL2019_D_HP_002.csv   0.3209353 0.13370286
## 7   7 M1_31JUL2019_D_IL_004.csv   0.3136393 0.21895262
## 8   8 M1_31JUL2019_D_MC_005.csv   0.2362327 0.09475741
## 9   9 M1_31JUL2019_D_OL_003.csv   0.2800608 0.12545326
## 10 10 M2_01AUG2019_N_EP_026.csv   0.4150776 0.23374166
## 11 11 M2_01AUG2019_N_HP_027.csv   0.3856111 0.22506159
## 12 12 M2_01AUG2019_N_IL_029.csv   0.3153525 0.19036970
## 13 13 M2_01AUG2019_N_OL_028.csv   0.3846906 0.20021767
## 14 14 M2_31JUL2019_D_EP_006.csv   0.3990312 0.17918518
## 15 15 M2_31JUL2019_D_HP_007.csv   0.4020613 0.21256396
## 16 16 M2_31JUL2019_D_IL_009.csv   0.2958483 0.15327264
## 17 17 M2_31JUL2019_D_MC_010.csv   0.2965415 0.18537989
## 18 18 M2_31JUL2019_D_OL_008.csv   0.2188298 0.17520853

Our output summary, which we also made into a .csv file called “LengthSummary.csv” in the previous steps, contains columns for ID, file_name, mean_length, and sd_length. Now that we have the mean and sd lengths for each of the samples (each of the original .csv files), we can plot summaries of the data using the stats_out data frame. With this script, it will be easy to add/remove/change the input files and re-create the output summary file.

# Create bar graph of mean and sd lengths for each sample

library(ggplot2)

ggplot(data = stats_out, mapping = aes(x = ID, y = mean_length)) +
  # one bar per sample, bar height = mean length
  geom_col(colour = "white", fill = "lightblue") +
  # whiskers extend one standard deviation above and below the mean
  geom_errorbar(mapping = aes(ymin = mean_length - sd_length,
                              ymax = mean_length + sd_length),
                colour = "darkgrey", width = 0.2) +
  labs(x = "Sample ID",
       y = "Zooplankton mean length (mm)",
       title = "Zooplankton lengths in each sample")


Back to Home Page