From 21b89f4e663ccd6569433ac016b3be6caa740c07 Mon Sep 17 00:00:00 2001 From: junikimm717 Date: Mon, 19 Jul 2021 15:05:17 -0400 Subject: [PATCH] added README and streamlined configuration --- README.md | 19 +++++++++++++++++++ src/script.def.R | 39 ++++++++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 13 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..a6db25c --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# Patternmatch + +This is a short R pattern.search wrapper that takes in a CSV file containing +HRMS Peaks and prints all of the fragment numbers that may contain isotopes of one's +choosing (preferably halogenated compounds). + +## Installation + +The script has the following dependencies: +- latest version of the [Nontarget R Package](https://github.com/blosloos/nontarget) (This should be installed through the devtools package) +- [purrr](https://www.rdocumentation.org/packages/purrr/versions/0.2.5) +- [enviPat](https://rdocumentation.org/packages/enviPat/versions/2.2) +- [stringr](https://www.rdocumentation.org/packages/stringr/versions/1.4.0) +- [parallel](https://www.rdocumentation.org/packages/parallel/versions/3.6.2) + +## Additional Notes + +So that each fragment's rows can be looked up in linear time, the script +currently sorts the entire data table by spectra number before searching. 
diff --git a/src/script.def.R b/src/script.def.R index 2e2ab29..b54aec7 100644 --- a/src/script.def.R +++ b/src/script.def.R @@ -6,22 +6,31 @@ library("parallel") # Configurations ############################################################# # file : decides which file to read in data from ############################# -file <- "/home/junikim/programming/patternmatch/data/allcluster_mz.csv" -#file <- "/home/junikim/programming/patternmatch/data/15_Clusters_for_Tuning_29June21.txt" - -# check cluster 2846 +file <- "/path/to/script" +# All isotopes to search for search_isos <- c("37Cl", "81Br") - # Minimum size of a cluster - min_cluster_size <- 2 -# Number of cores to be used (will be adjusted if not possible) +# Number of cores to be used (will be adjusted if it exceeds the true number of cores) use_cores <- 6 -# Do not edit below. +# Table name configuration + +# Column name for m/z +columns.mz <- "mz" +# Column name for time in gc +columns.time <- "time" +# Column name for intensities +columns.intensity <- "Intensity" +# Column name for fragment numbers (only numbers accepted) +columns.spectra <- "Spectra_Number" + +############################################################################## +# Script + iso_length <- length(search_isos) if (!("13C" %in% search_isos)) { search_isos <- append(search_isos, "13C") @@ -29,8 +38,11 @@ if (!("13C" %in% search_isos)) { # Read in the Table ########################################################## table <- read.table(file, header=TRUE, sep=",") +# Sort table by spectra ID +table <- table[order(table[,columns.spectra]),] + # Organize the tables by number ############################################## -fragments <- max(table[,"Spectra_Number"]) +fragments <- max(table[,columns.spectra]) # The algorithm below guarantees linear complexity of looking up data points.# @@ -48,7 +60,7 @@ usable <- unlist(map(1:fragments, function(x) FALSE)) # Set all of the intervals ################################################### for (i in 
seq(1, nrow(table))) { - fragment <- table[i, "Spectra_Number"] + fragment <- table[i, columns.spectra] if (! usable[fragment]) { minint[fragment] <- i } @@ -71,9 +83,10 @@ getdata <- function(fragment, key) { # Add all data frames as necessary for evaluation. ############################ getdataframe <- function (fragment) { - mz <- getdata(fragment, "mz") - time <- getdata(fragment, "time") - Intensity <- getdata(fragment, "Intensity") + mz <- getdata(fragment, columns.mz) + time <- getdata(fragment, columns.time) + Intensity <- getdata(fragment, columns.intensity) + # Must be indexed in this order. return(data.frame(mz=mz, Intensity=Intensity,time=time)) }