added README and streamlined configuration

5 years ago · 21b89f4e66
2 changed files with 45 additions and 13 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,19 @@
+# Patternmatch
+
+This is a short R wrapper around `pattern.search` that reads a CSV file of
+HRMS peaks and prints the numbers of all fragments that may contain isotopes
+of your choosing (ideally for halogenated compounds).
+
+## Installation
+
+The script has the following dependencies:
+- the latest version of the [nontarget R package](https://github.com/blosloos/nontarget) (this should be installed through the devtools package)
+- [purrr](https://www.rdocumentation.org/packages/purrr/versions/0.2.5)
+- [enviPat](https://rdocumentation.org/packages/enviPat/versions/2.2)
+- [stringr](https://www.rdocumentation.org/packages/stringr/versions/1.4.0)
+- [parallel](https://www.rdocumentation.org/packages/parallel/versions/3.6.2)
+
+## Additional Notes
+
+To keep the lookup of data points linear in time, the entire data table is
+currently sorted by spectra number before searching.
--- a/src/script.def.R
+++ b/src/script.def.R
@ -6,22 +6,31 @@ library("parallel")

 # Configurations #############################################################
 # file : decides which file to read in data from #############################
-file <- "/home/junikim/programming/patternmatch/data/allcluster_mz.csv"
-#file <- "/home/junikim/programming/patternmatch/data/15_Clusters_for_Tuning_29June21.txt"
-
-# check cluster 2846
+file <- "/path/to/script"

+# All isotopes to search for
 search_isos <- c("37Cl", "81Br")

-
 # Minimum size of a cluster
-
 min_cluster_size <- 2

-# Number of cores to be used (will be adjusted if not possible)
+# Number of cores to be used (will be adjusted if it exceeds the true number of cores)
 use_cores <- 6

-# Do not edit below.
+# Table name configuration
+
+# Column name for m/z
+columns.mz <- "mz"
+# Column name for time in GC
+columns.time <- "time"
+# Column name for intensities
+columns.intensity <- "Intensity"
+# Column name for fragment numbers (only numbers accepted)
+columns.spectra <- "Spectra_Number"
+
+##############################################################################
+# Script
+
 iso_length <- length(search_isos)
 if (!("13C" %in% search_isos)) {
    search_isos <- append(search_isos, "13C")
@ -29,8 +38,11 @@ if (!("13C" %in% search_isos)) {
 # Read in the Table ##########################################################
 table <- read.table(file, header=TRUE, sep=",")

+# Sort table by spectra ID
+table <- table[order(table[,columns.spectra]),]
+
 # Organize the tables by number ##############################################
-fragments <- max(table[,"Spectra_Number"])
+fragments <- max(table[,columns.spectra])


 # The algorithm below guarantees linear complexity of looking up data points.#
@ -48,7 +60,7 @@ usable <- unlist(map(1:fragments, function(x) FALSE))

 # Set all of the intervals ###################################################
 for (i in seq(1, nrow(table))) {
-    fragment <- table[i, "Spectra_Number"]
+    fragment <- table[i, columns.spectra]
    if (! usable[fragment]) {
        minint[fragment] <- i
    }
@ -71,9 +83,10 @@ getdata <- function(fragment, key) {

 # Add all data frames as necessary for evaluation. ############################
 getdataframe <- function (fragment) {
-    mz <- getdata(fragment, "mz")
-    time <- getdata(fragment, "time")
-    Intensity <- getdata(fragment, "Intensity")
+    mz <- getdata(fragment, columns.mz)
+    time <- getdata(fragment, columns.time)
+    Intensity <- getdata(fragment, columns.intensity)
+    # The columns must be kept in this order: mz, Intensity, time.
    return(data.frame(mz=mz, Intensity=Intensity,time=time))
 }