From 9c4fa981ca256e489d531e5d5d7435684afb76f8 Mon Sep 17 00:00:00 2001 From: junikimm717 Date: Mon, 19 Jul 2021 20:33:19 -0400 Subject: [PATCH] cleaned up codebase --- specification.md | 19 +++++- src/script.def.R | 162 ++++++++++++++++++++++++++++++----------------- 2 files changed, 123 insertions(+), 58 deletions(-) diff --git a/specification.md b/specification.md index cc3103a..5a63177 100644 --- a/specification.md +++ b/specification.md @@ -62,7 +62,24 @@ isotopes. ## Data Summary -- Flags for certainty of halogenated compounds +Per row: + +- Fragment number +- Number of peaks +- top 5 (configurable) most intense m/z values. +- Averaged time output + +### Derivations for above + +- Fragment number - trivial +- Number of each isotope (check group counts per isotope) +- most intense - loop through the fragment and isolate. (Preserve all three categories) +- Averaged time output. + +## Output num 2 + +- each cluster of elements, dump into a file (cluster number). + # Test Notes diff --git a/src/script.def.R b/src/script.def.R index b54aec7..f4e0555 100644 --- a/src/script.def.R +++ b/src/script.def.R @@ -1,15 +1,14 @@ library("nontarget") -library("purrr") library("enviPat") library("stringr") library("parallel") # Configurations ############################################################# # file : decides which file to read in data from ############################# -file <- "/path/to/script" +file <- "/path/to/csv" -# All isotopes to search for -search_isos <- c("37Cl", "81Br") +# All isotopes to check for +check_isos <- c("37Cl", "81Br", "34S") # Minimum size of a cluster min_cluster_size <- 2 @@ -28,50 +27,49 @@ columns.intensity <- "Intensity" # Column name for fragment numbers (only numbers accepted) columns.spectra <- "Spectra_Number" -############################################################################## -# Script +# output options +output.intensities <- 5 + +# Verbose output +verbose.enable <- TRUE +verbose.outputdir <- "/output/dir" + + +# Script ##################################################################### +## Defining/setting variables ################################################ + +# C-13 is required in the rules of pattern.search, so it must be included. ### +search_isos <- check_isos iso_length <- length(search_isos) if (!("13C" %in% search_isos)) { search_isos <- append(search_isos, "13C") } -# Read in the Table ########################################################## +# Read in the Table and sort by spectra ID table <- read.table(file, header=TRUE, sep=",") - -# Sort table by spectra ID table <- table[order(table[,columns.spectra]),] -# Organize the tables by number ############################################## +# Organize the tables by number fragments <- max(table[,columns.spectra]) - -# The algorithm below guarantees linear complexity of looking up data points.# - # set use_cores to safe amount use_cores = min(use_cores, detectCores()-1) -# minint is the lower bound of the interval. ################################# -# maxint is the upper bound of the interval that contains the fragment. ###### -# usable memoizes as to whether or not each fragment number exists. ##### -minint <- unlist(map(1:fragments, function(x) 0)) -maxint <- unlist(map(1:fragments, function(x) 0)) -usable <- unlist(map(1:fragments, function(x) FALSE)) +# lower bound of indices per fragment +# (sorting guarantees that data points of same fragment are together) +minint <- rep(0,fragments) +# upper bound of indices per fragment +maxint <- rep(0,fragments) +# memoizes as to +usable <- rep(FALSE,fragments) +# initialize isotopes. +data(isotopes) +isos <- make.isos(isotopes,use_isotopes=search_isos, + use_charges=rep(1, length(search_isos))) + -# Set all of the intervals ################################################### -for (i in seq(1, nrow(table))) { - fragment <- table[i, columns.spectra] - if (! usable[fragment]) { - minint[fragment] <- i - } - maxint[fragment] <- max(maxint[fragment], i) - usable[fragment] <- TRUE -} - -for (i in 1:fragments) { - if (maxint[i] - minint[i] + 1 < min_cluster_size) - usable[i] = FALSE -} +## Setting functions ######################################################### getdata <- function(fragment, key) { if (! usable[fragment]) { @@ -81,21 +79,18 @@ getdata <- function(fragment, key) { return(table[minint[fragment]:maxint[fragment],key]) } -# Add all data frames as necessary for evaluation. ############################ +# Add all data frames as necessary for evaluation. +# Note: for pattern.search to work, the data must be ordered in +# mz, intensity, time. getdataframe <- function (fragment) { mz <- getdata(fragment, columns.mz) time <- getdata(fragment, columns.time) Intensity <- getdata(fragment, columns.intensity) - # Must be indexed in this order. return(data.frame(mz=mz, Intensity=Intensity,time=time)) } -# initialize isotopes. -data(isotopes) -isos <- make.isos(isotopes,use_isotopes=search_isos, - use_charges=rep(1, length(search_isos))) - # returns both pattern.search result and fragment number. + diagnostics <- function(fragment) { points <- getdataframe(fragment) ptrn <- pattern.search( @@ -108,35 +103,88 @@ diagnostics <- function(fragment) { mzfrac=0.1, ppm=TRUE, inttol=0.05, - # Do not modify anything below. rules=rep(TRUE,11), - #rules=c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE), deter=FALSE, entry=50 ); return(c(ptrn, fragment)) } -use <- Filter(function(x) usable[x], 1:fragments) +# checks if the result is positive. -results <- mclapply(use, diagnostics, mc.cores=use_cores) +positive <- function(result) { + res <- result$`Counts of isotopes` + v <- res[res$"isotope" %in% check_isos,"peak counts"] + return (!all(v %in% c(0))) +} -handle_res <- function(result) { - allzero <- function(v) { - for (x in v) { - if (x != 0) - return (FALSE) - } - return (TRUE) - } - v <- result$`Counts of isotopes`[seq(1,iso_length),"peak counts"] - if (!allzero(v)) { - print(result[[13]]) +derive.fragment <- function(result) { + return(result[[13]]) +} + +derive.iso <- function(result) { + res <- result$`Counts of isotopes` + return (res[res$"isotope" %in% check_isos,c("isotope", "group counts")]) +} + +derive.intensity <- function(result) { + ptrn <- result$Patterns + N <- max(output.intensities, nrow(result)) + inds <- order(ptrn[,columns.intensity], decreasing=TRUE)[1:N] + return(ptrn[inds, c(2, 1, 3)]) +} + +derive.average <- function(result) { + ptrn <- result$Patterns[,2] + return(mean(ptrn)) +} + +derive.frame <- function(result) { + res <- list( + derive.fragment(result), + derive.iso(result), + derive.intensity(result), + derive.average(result) + ) + names(res) <- c("Fragment_ID", "Peaks", "Intensities", "Average_time") + return(res) +} + +# Execution ################################################################## + +## Set all of the intervals ################################################## + +for (i in seq(1, nrow(table))) { + fragment <- table[i, columns.spectra] + if (! usable[fragment]) { + minint[fragment] <- i } + maxint[fragment] <- max(maxint[fragment], i) + usable[fragment] <- TRUE +} + +# make fragment unusable if it does not contain enough data points. +for (i in 1:fragments) { + if (maxint[i] - minint[i] + 1 < min_cluster_size) + usable[i] = FALSE } -for (result in results) { - handle_res(result) +# Filter all usable fragments +use <- Filter(function(x) usable[x], 1:fragments) + +# Apply diagnostics function to all usable fragments (in parallel). +results <- mclapply(use, diagnostics, mc.cores=use_cores) + +results.positive <- Filter(function(x) positive(x), results) + +result.frame <- mapply(derive.frame, results.positive) +result.fragment <- mapply(derive.fragment, results.positive) +result.iso <- mapply(derive.iso, results.positive) +result.intensity <- mapply(derive.intensity, results.positive) +result.average <- mapply(derive.average, results.positive) + +for (x in result.frame) { + print(x) } ##############################################################################