cleaned up codebase

3 years ago · 9c4fa981ca
2 changed files with 123 additions and 58 deletions
--- a/specification.md
+++ b/specification.md
@ -62,7 +62,24 @@ isotopes.

 ## Data Summary

- Flags for certainty of halogenated compounds
+Per row:
+
+- Fragment number
+- Number of peaks
+- top 5 (configurable) most intense m/z values.
+- Averaged time output
+
+### Derivations for above
+
+- Fragment number - trivial
+- Number of each isotope (check group counts per isotope)
+- most intense - loop through the fragment and isolate. (Preserve all three categories)
+- Averaged time output.
+
+## Output num 2
+
+- each cluster of elements, dump into a file (cluster number).
+

 # Test Notes

--- a/src/script.def.R
+++ b/src/script.def.R
@ -1,15 +1,14 @@
 library("nontarget")
-library("purrr")
 library("enviPat")
 library("stringr")
 library("parallel")

 # Configurations #############################################################
 # file : decides which file to read in data from #############################
-file <- "/path/to/script"
+file <- "/path/to/csv"

-# All isotopes to search for
-search_isos <- c("37Cl", "81Br")
+# All isotopes to check for
+check_isos <- c("37Cl", "81Br", "34S")

 # Minimum size of a cluster
 min_cluster_size <- 2
@ -28,50 +27,49 @@ columns.intensity <- "Intensity"
 # Column name for fragment numbers (only numbers accepted)
 columns.spectra <- "Spectra_Number"

-##############################################################################
-# Script
+# output options
+output.intensities <- 5
+
+# Verbose output
+verbose.enable <- TRUE
+verbose.outputdir <- "/output/dir"
+
+
+# Script #####################################################################

+## Defining/setting variables ################################################
+
+# C-13 is required in the rules of pattern.search, so it must be included. ###
+search_isos <- check_isos
 iso_length <- length(search_isos)
 if (!("13C" %in% search_isos)) {
    search_isos <- append(search_isos, "13C")
 }
-# Read in the Table ##########################################################
+# Read in the Table and sort by spectra ID
 table <- read.table(file, header=TRUE, sep=",")
-
-# Sort table by spectra ID
 table <- table[order(table[,columns.spectra]),]

-# Organize the tables by number ##############################################
+# Organize the tables by number
 fragments <- max(table[,columns.spectra])

-
-# The algorithm below guarantees linear complexity of looking up data points.#
-
 # set use_cores to safe amount
 use_cores = min(use_cores, detectCores()-1)

-# minint is the lower bound of the interval. #################################
-# maxint is the upper bound of the interval that contains the fragment. ######
-# usable memoizes as to whether or not each fragment number exists. #####
-minint <- unlist(map(1:fragments, function(x) 0))
-maxint <- unlist(map(1:fragments, function(x) 0))
-usable <- unlist(map(1:fragments, function(x) FALSE))
+# lower bound of indices per fragment 
+# (sorting guarantees that data points of same fragment are together)
+minint <- rep(0,fragments)
+# upper bound of indices per fragment
+maxint <- rep(0,fragments)
+# memoizes as to 
+usable <- rep(FALSE,fragments)

+# initialize isotopes.
+data(isotopes)
+isos <- make.isos(isotopes,use_isotopes=search_isos, 
+                  use_charges=rep(1, length(search_isos)))
                  
-# Set all of the intervals ###################################################
-for (i in seq(1, nrow(table))) {
-    fragment <- table[i, columns.spectra]
-    if (! usable[fragment]) {
-        minint[fragment] <- i
-    }
-    maxint[fragment] <- max(maxint[fragment], i)
-    usable[fragment] <- TRUE
-}

-for (i in 1:fragments) {
-    if (maxint[i] - minint[i] + 1 < min_cluster_size)
-        usable[i] = FALSE
-}
+## Setting functions #########################################################

 getdata <- function(fragment, key) {
    if (! usable[fragment]) {
@ -81,21 +79,18 @@ getdata <- function(fragment, key) {
    return(table[minint[fragment]:maxint[fragment],key])
 }

-# Add all data frames as necessary for evaluation. ############################
+# Add all data frames as necessary for evaluation.
+# Note: for pattern.search to work, the data must be ordered in
+# mz, intensity, time.
 getdataframe <- function (fragment) {
    mz <- getdata(fragment, columns.mz)
    time <- getdata(fragment, columns.time)
    Intensity <- getdata(fragment, columns.intensity)
-    # Must be indexed in this order.
    return(data.frame(mz=mz, Intensity=Intensity,time=time))
 }

-# initialize isotopes.
-data(isotopes)
-isos <- make.isos(isotopes,use_isotopes=search_isos, 
-                  use_charges=rep(1, length(search_isos)))
-
 # returns both pattern.search result and fragment number.
+
 diagnostics <- function(fragment) {
    points <- getdataframe(fragment)
    ptrn <- pattern.search(
@ -108,35 +103,88 @@ diagnostics <- function(fragment) {
        mzfrac=0.1,
        ppm=TRUE,
        inttol=0.05,
-        # Do not modify anything below.
        rules=rep(TRUE,11),
-        #rules=c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE),
        deter=FALSE,
        entry=50
    );
    return(c(ptrn, fragment))
 }

-use <- Filter(function(x) usable[x], 1:fragments)
+# checks if the result is positive.

-results <- mclapply(use, diagnostics, mc.cores=use_cores)
+positive <- function(result) {
+    res <- result$`Counts of isotopes`
+    v <- res[res$"isotope" %in% check_isos,"peak counts"]
+    return (!all(v %in% c(0)))
+}

-handle_res <- function(result) {
-    allzero <- function(v) {
-        for (x in v) {
-            if (x != 0)
-                return (FALSE)
-        }
-        return (TRUE)
-    }
-    v <- result$`Counts of isotopes`[seq(1,iso_length),"peak counts"]
-    if (!allzero(v)) {
-        print(result[[13]])
+derive.fragment <- function(result) {
+    return(result[[13]])
+}
+
+derive.iso <- function(result) {
+    res <- result$`Counts of isotopes`
+    return (res[res$"isotope" %in% check_isos,c("isotope", "group counts")])
+}
+
+derive.intensity <- function(result) {
+    ptrn <- result$Patterns
+    N <- max(output.intensities, nrow(result))
+    inds <- order(ptrn[,columns.intensity], decreasing=TRUE)[1:N]
+    return(ptrn[inds, c(2, 1, 3)])
+}
+
+derive.average <- function(result) {
+    ptrn <- result$Patterns[,2]
+    return(mean(ptrn))
+}
+
+derive.frame <- function(result) {
+    res <- list(
+            derive.fragment(result),
+            derive.iso(result),
+            derive.intensity(result),
+            derive.average(result)
+        )
+    names(res) <- c("Fragment_ID", "Peaks", "Intensities", "Average_time")
+    return(res)
+}
+
+# Execution ##################################################################
+
+## Set all of the intervals ##################################################
+
+for (i in seq(1, nrow(table))) {
+    fragment <- table[i, columns.spectra]
+    if (! usable[fragment]) {
+        minint[fragment] <- i
    }
+    maxint[fragment] <- max(maxint[fragment], i)
+    usable[fragment] <- TRUE
+}
+
+# make fragment unusable if it does not contain enough data points.
+for (i in 1:fragments) {
+    if (maxint[i] - minint[i] + 1 < min_cluster_size)
+        usable[i] = FALSE
 }

-for (result in results) {
-    handle_res(result)
+# Filter all usable fragments
+use <- Filter(function(x) usable[x], 1:fragments)
+
+# Apply diagnostics function to all usable fragments (in parallel).
+results <- mclapply(use, diagnostics, mc.cores=use_cores)
+
+results.positive <- Filter(function(x) positive(x), results)
+
+result.frame <- mapply(derive.frame, results.positive)
+result.fragment <- mapply(derive.fragment, results.positive)
+result.iso <- mapply(derive.iso, results.positive)
+result.intensity <- mapply(derive.intensity, results.positive)
+result.average <- mapply(derive.average, results.positive)
+
+for (x in result.frame) {
+    print(x)
 }

 ##############################################################################