Homework #11

# Set the path to the main folder containing subfolders
main_folder <- "C:/Users/carlos/Desktop/OriginalData"

# Immediate subfolders only (recursive = FALSE), returned as full paths
subfolders <- list.dirs(main_folder, full.names = TRUE, recursive = FALSE)

# Regular expression matched against the file names inside each subfolder
file_pattern <- "NEON.D01.BART.DP1.10003.001.brd_countdata.*\\.csv"

# Create a folder to hold the files of interest.
# dir.exists() is used instead of file.exists() so that a regular file with
# the same name is not mistaken for the destination folder.
Nfolder <- "C:/Users/carlos/Desktop/NewFolder"
if (!dir.exists(Nfolder)) {
  dir.create(Nfolder, recursive = TRUE)
}

# Loop through each subfolder and copy its matching files
for (folder in subfolders) {
  files <- list.files(folder, pattern = file_pattern, full.names = TRUE)

  if (length(files) > 0) {
    # file.copy() is vectorised: copy every match in one call
    targets <- file.path(Nfolder, basename(files))
    copied <- file.copy(files, targets)

    # file.copy() also returns FALSE when the target already exists
    # (overwrite = FALSE), so only report files that are really missing.
    failed <- files[!copied & !file.exists(targets)]
    if (length(failed) > 0) {
      warning("Could not copy: ", paste(failed, collapse = ", "),
              call. = FALSE)
    }
  } else {
    cat("No files matching", file_pattern, "found in", folder, "\n")
  }
}

## No files matching NEON.D01.BART.DP1.10003.001.brd_countdata.*\.csv found in C:/Users/carlos/Desktop/OriginalData/.Rproj.user 
## No files matching NEON.D01.BART.DP1.10003.001.brd_countdata.*\.csv found in C:/Users/carlos/Desktop/OriginalData/gitsandbox

Starting with pseudo-code, generate functions for 1) cleaning the data of any empty/missing cases, 2) extracting the year from each file name, 3) calculating abundance for each year (total number of individuals found), and 4) calculating species richness for each year (number of unique species found).

#2 cleaning data
library(stringr)# required to use str_extract()

# Function to clean the data
# Remove rows whose scientificName is missing or empty.
#
# data: a data frame with a `scientificName` column.
# Returns a data frame with the same columns, keeping only rows where
# scientificName is neither NA nor a blank / whitespace-only string.
# Stops with an error when the column is absent.
clean_data <- function(data) {
  if (!"scientificName" %in% names(data)) {
    stop("`data` must contain a `scientificName` column", call. = FALSE)
  }
  species <- data[["scientificName"]]

  # read.csv() leaves empty text cells as "" rather than NA, so blanks have
  # to be tested for explicitly.
  keep <- !is.na(species) & nzchar(trimws(as.character(species)))

  # drop = FALSE keeps the result a data frame even with a single column
  data[keep, , drop = FALSE]
}

# Function to extract the year from the file name
# Extract the four-digit year from a file name.
#
# file_name: character vector of file names, e.g.
#   "NEON.D01.BART.DP1.10003.001.brd_countdata.2015-06.basic.<stamp>.csv"
# Returns a character vector of the same length; NA where no year is found.
#
# Taking the first run of four digits is wrong for these names: it picks
# "1000" out of the product code "10003". The year is therefore read from
# the "YYYY-MM" token, with a fallback to a stand-alone four-digit number
# for names that carry no month. Base regex functions are used so the
# function works whether or not stringr is attached.
extract_year <- function(file_name) {
  file_name <- as.character(file_name)

  # First match of `pattern` in each name, NA where there is none
  first_match <- function(pattern) {
    hit <- regexpr(pattern, file_name, perl = TRUE)
    found <- !is.na(hit) & hit > 0
    out <- rep(NA_character_, length(file_name))
    out[found] <- regmatches(file_name, hit)
    out
  }

  # Four digits, not part of a longer number, followed by "-MM"
  year <- first_match("(?<![0-9])[0-9]{4}(?=-[0-9]{2}(?![0-9]))")

  # Fallback: a four-digit number standing on its own
  missing_year <- is.na(year)
  if (any(missing_year)) {
    fallback <- first_match("(?<![0-9])[0-9]{4}(?![0-9])")
    year[missing_year] <- fallback[missing_year]
  }
  year
}

# Function to calculate abundance
# Abundance of a data set, taken as its number of rows.
#
# data: a data frame of observations (one row per record).
# Returns the row count as an integer.
# NOTE(review): this counts one individual per row; if the data carry a
# per-row count of individuals, that column would need summing - confirm.
calculate_abundance <- function(data) {
  n_records <- dim(data)[1L]
  n_records
}

# Function to calculate species richness
# Species richness: how many distinct values occur in `scientificName`.
#
# data: a data frame with a `scientificName` column.
# Returns the number of distinct names as an integer.
calculate_species_richness <- function(data) {
  species <- data$scientificName
  # Each distinct value is flagged exactly once as "not a duplicate"
  first_seen <- !duplicated(species)
  sum(first_seen)
}

# Set the directory where the files are located.
# The spelling matches the folder created earlier ("NewFolder") so the
# lookup also works on case-sensitive file systems.
n_dir <- "C:/Users/carlos/Desktop/NewFolder"

# Create a list with the count-data files in that directory (full paths)
files <- list.files(n_dir, pattern = "NEON.D01.BART.DP1.10003.001.brd_countdata..*\\.csv$", full.names = TRUE)

# Make an empty result visible instead of silently printing nothing
if (length(files) == 0) {
  warning("No count-data files found in ", n_dir, call. = FALSE)
}

# Loop through each file and print its summary statistics
for (file in files) {
  # Extract the year from the file name
  year <- extract_year(basename(file))

  # Read the data from the file
  data <- read.csv(file)

  # Clean the data
  cleaned_data <- clean_data(data)

  # Calculate abundance
  abundance <- calculate_abundance(cleaned_data)

  # Calculate species richness
  richness <- calculate_species_richness(cleaned_data)

  # Print results
  cat("File:", basename(file), "\n")
  cat("Year:", year, "\n")
  cat("Abundance:", abundance, "\n")
  cat("Species Richness:", richness, "\n")
  cat("\n")
}

## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2015-06.basic.20231226T232626Z.csv 
## Year: 1000 
## Abundance: 454 
## Species Richness: 40 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2016-06.basic.20231227T013428Z.csv 
## Year: 1000 
## Abundance: 883 
## Species Richness: 39 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2017-06.basic.20231227T094709Z.csv 
## Year: 1000 
## Abundance: 685 
## Species Richness: 35 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2018-06.basic.20231228T172744Z.csv 
## Year: 1000 
## Abundance: 772 
## Species Richness: 37 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2019-06.basic.20231227T184129Z.csv 
## Year: 1000 
## Abundance: 628 
## Species Richness: 44 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2020-06.basic.20231227T224944Z.csv 
## Year: 1000 
## Abundance: 626 
## Species Richness: 46 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2020-07.basic.20231227T225020Z.csv 
## Year: 1000 
## Abundance: 89 
## Species Richness: 18 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2021-06.basic.20231228T010546Z.csv 
## Year: 1000 
## Abundance: 1015 
## Species Richness: 50 
## 
## File: NEON.D01.BART.DP1.10003.001.brd_countdata.2022-06.basic.20231229T053256Z.csv 
## Year: 1000 
## Abundance: 699 
## Species Richness: 39

##### Create an initial empty data frame to hold the above summary statistics. You should have 4 columns: one for the file name, one for abundance, one for species richness, and one for year.

# Empty summary table, one row per file to be added later:
#   File      - file name (character)
#   Date      - year taken from the file name (character)
#   Abundance - abundance for that file (numeric)
#   Richness  - species richness for that file (numeric)
results <- data.frame(
  File = vector("character", 0L),
  Date = vector("character", 0L),
  Abundance = vector("numeric", 0L),
  Richness = vector("numeric", 0L),
  stringsAsFactors = FALSE # keep text columns as character, not factor
)

Using a for loop, run your created functions as a batch process for each folder, changing the working directory as necessary to read in the correct files, calculating summary statistics with your created functions, and then writing them out into your summary statistics data frame.

  # Loop through each file in the folder. Each file contributes one summary
  # row; the rows are collected in a pre-sized list and bound to `results`
  # once after the loop, instead of re-copying the data frame with rbind()
  # on every iteration.
  summary_rows <- vector("list", length(files))
  for (i in seq_along(files)) {
    file <- files[i]

    # Extract the year from the file name
    year <- extract_year(basename(file))

    # Read the data from the file
    data <- read.csv(file)

    # Clean the data
    cleaned_data <- clean_data(data)

    # Calculate abundance
    abundance <- calculate_abundance(cleaned_data)

    # Calculate species richness
    richness <- calculate_species_richness(cleaned_data)

    # Store this file's summary row
    summary_rows[[i]] <- data.frame(File = basename(file),
                                    Date = year,
                                    Abundance = abundance,
                                    Richness = richness,
                                    stringsAsFactors = FALSE) # Ensuring strings are treated as characters
  }

  # Append all rows to the existing (possibly empty) results table at once;
  # with no files this leaves `results` unchanged.
  results <- do.call(rbind, c(list(results), summary_rows))
# Print the results
print(results)

##                                                                           File
## 1 NEON.D01.BART.DP1.10003.001.brd_countdata.2015-06.basic.20231226T232626Z.csv
## 2 NEON.D01.BART.DP1.10003.001.brd_countdata.2016-06.basic.20231227T013428Z.csv
## 3 NEON.D01.BART.DP1.10003.001.brd_countdata.2017-06.basic.20231227T094709Z.csv
## 4 NEON.D01.BART.DP1.10003.001.brd_countdata.2018-06.basic.20231228T172744Z.csv
## 5 NEON.D01.BART.DP1.10003.001.brd_countdata.2019-06.basic.20231227T184129Z.csv
## 6 NEON.D01.BART.DP1.10003.001.brd_countdata.2020-06.basic.20231227T224944Z.csv
## 7 NEON.D01.BART.DP1.10003.001.brd_countdata.2020-07.basic.20231227T225020Z.csv
## 8 NEON.D01.BART.DP1.10003.001.brd_countdata.2021-06.basic.20231228T010546Z.csv
## 9 NEON.D01.BART.DP1.10003.001.brd_countdata.2022-06.basic.20231229T053256Z.csv
##   Date Abundance Richness
## 1 1000       454       40
## 2 1000       883       39
## 3 1000       685       35
## 4 1000       772       37
## 5 1000       628       44
## 6 1000       626       46
## 7 1000        89       18
## 8 1000      1015       50
## 9 1000       699       39

Homework #11

Carlos Amissah

2024-05-08

Starting with pseudo-code, generate functions for 1) cleaning the data of any empty/missing cases, 2) extracting the year from each file name, 3) calculating abundance for each year (total number of individuals found), and 4) calculating species richness for each year (number of unique species found).

Using a for loop, run your created functions as a batch process for each folder, changing the working directory as necessary to read in the correct files, calculating summary statistics with your created functions, and then writing them out into your summary statistics data frame.