From bbee9467c3d7d234cab77475766687f518afcbe4 Mon Sep 17 00:00:00 2001
From: junikimm717 <68165832+junikimm717@users.noreply.github.com>
Date: Sat, 5 Jun 2021 22:09:53 -0400
Subject: [PATCH] added parallelization and changed nontarget versions

---
 specification.md | 20 ++++++++++++++++++++
 src/script.def.R | 48 +++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/specification.md b/specification.md
index bcb66a8..c54b863 100644
--- a/specification.md
+++ b/specification.md
@@ -19,3 +19,23 @@ isotopes.
 - Readable by R.
 - Manipulatable by Tidyverse
 - Include it in script?
+
+# Additional Notes
+
+- Independent of the ordering (it might get corrupted)
+- Just search for chlorinated/brominated compounds.
+- usually, use_charges will be 1
+- Headers to use in script.
+
+- Packages to do things in parallel (multiple threads)
+- be able to configure the number of cores or threads used.
+- make it a requirement to be tab-delimited.
+
+- eliminate clusters with only m/z.
+
+# Notes
+
+- cutint represents high enough intensity.
+- mztol is in terms of ppm.
+- mzfrac is in terms of absolute.
+- PPM = true always
diff --git a/src/script.def.R b/src/script.def.R
index d548fa6..5c96dd0 100644
--- a/src/script.def.R
+++ b/src/script.def.R
@@ -1,44 +1,62 @@
 library("nontarget")
 library("purrr")
+library("funprog")
 library("enviPat")
 library("stringr")
+library("parallel")
 
 # Configurations #############################################################
 # file : decides which file to read in data from #############################
 file <- "/path/to/file"
-# separator: decides separator of file #######################################
-separator <- "separator"
+
+search_isos <- c("13C", "37Cl")
+
+# Minimum size of a cluster
+
+min_cluster_size <- 3
+
+# Number of cores to be used (will be adjusted if not possible)
+used_cores <- 3
 
 ##############################################################################
 # Read in the Table ##########################################################
-table <- read.table(file, header=TRUE, sep=separator)
+table <- read.table(file, header=TRUE, sep=",")
 
 # Organize the tables by number ##############################################
 fragments <- max(table[,"Spectra_Number"])
 
 
+
 # The algorithm below guarantees linear complexity of looking up data points.#
 
+# use_cores: configured used_cores capped at detectCores() - 1, at least 1.
+use_cores <- max(1, min(used_cores, detectCores() - 1, na.rm = TRUE))
+
 # minint is the lower bound of the interval. #################################
 # maxint is the upper bound of the interval that contains the fragment. ######
-# initialized memoizes as to whether or not each fragment number exists. #####
+# usable flags fragments that exist and span at least min_cluster_size rows. #
 minint <- unlist(map(1:fragments, function(x) 0))
 maxint <- unlist(map(1:fragments, function(x) 0))
-initialized <- unlist(map(1:fragments, function(x) FALSE))
+usable <- unlist(map(1:fragments, function(x) FALSE))
 
 
 # Set all of the intervals ###################################################
 for (i in seq(1, nrow(table))) {
     fragment <- table[i, "Spectra_Number"]
-    if (! initialized[fragment]) {
+    if (! usable[fragment]) {
         minint[fragment] <- i
     }
     maxint[fragment] <- max(maxint[fragment], i)
-    initialized[fragment] <- TRUE
+    usable[fragment] <- TRUE
+}
+
+# Drop clusters that span fewer than min_cluster_size rows (maxint - minint
+# + 1 is the row span). Done as one vectorized mask instead of an index loop;
+# fragments that never appeared are already FALSE and stay that way.
+usable <- usable & (maxint - minint + 1 >= min_cluster_size)
 
 getdata <- function(fragment, key) {
-    if (! initialized[fragment]) {
+    if (! usable[fragment]) {
         stop(str_interp("Fragment $[d]{fragment} does not exist in the data set", 
                     list(fragment=fragment)))
     }
@@ -55,10 +73,22 @@ getdataframe <- function (fragment) {
 
 # Incomplete: cannot get pattern.search function to work per cluster. #########
 # Incomplete: Get the diagnostics for a cluster and turn them into a portable#
-# format. (such as through tinyverse) ########################################
+# format. (such as through tidyverse) ########################################
+
+isos <- make.isos(isotopes,use_isotopes=search_isos, 
+                  use_charges=rep(1, length(search_isos)))
 
 diagnostics <- function(fragment) {
     points <- getdataframe(fragment)
+    ptrn <- pattern.search(points, isos)
+    return(ptrn)
 }
 
+use <- Filter(function(x) usable[x], 1:fragments)
+
+results <- mclapply(use, diagnostics, mc.cores=use_cores)
+
+# Incomplete: Make the analysis more resilient to different sorting.
+# Incomplete: How to process results. (is it actually supposed to be all negatives?)
+
 ##############################################################################