You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

133 lines
4.0 KiB

  1. import csv
  2. """
  3. Required fields when inserting into the database.
  4. """
  5. _required_fields = [
  6. # the "str" type means that this field can be any valid string.
  7. ("metabolite_name", "str"),
  8. ("formula", "str"),
  9. # any field labeled a "float" needs to have a value in decimal notation.
  10. ("mass", "float"),
  11. ("final_mz", "float"),
  12. ("final_rt", "float"),
  13. ("final_adduct", "str"),
  14. ("standard_grp", "str"),
  15. ("msms_detected", "yesno"), # Value can either be "Yes" or "No"
  16. ("inchikey", "str"),
  17. ]
  18. """
  19. Optional fields and corresponding types when batch inserting into the database.
  20. """
  21. _optional_fields = [
  22. ("chemical_db_id", "str"),
  23. ("library", "str"),
  24. ("pubchem_cid", "int"), # Only integers are permitted.
  25. ("pubmed_refcount", "int"),
  26. ("standard_class", "str"),
  27. ("inchikey14", "str"),
  28. ("adduct", "str"),
  29. ("detected_adducts", "str"),
  30. ("adduct_calc_mz", "str"),
  31. ("msms_purity", "float"),
  32. ]
  33. """
  34. All fields (excluding those that are commented) are mandatory to include.
  35. """
  36. _query_fields = [
  37. ("rt_min", "float"),
  38. ("rt_max", "float"),
  39. ("mz_min", "float"),
  40. ("mz_max", "float"),
  41. # ("year_max", "int"),
  42. # ("day_max", "int"),
  43. # ("month_max", "int"),
  44. ]
  45. def _validate_type(field: str, value: str, t):
  46. if t == "yesno":
  47. l = value.strip().lower()
  48. if l == "yes":
  49. return True
  50. elif l == "no":
  51. return False
  52. else:
  53. raise ValueError(
  54. f"Yes/No field {field} does not have a valid value {value}")
  55. elif t == "int":
  56. try:
  57. return int(value)
  58. except ValueError:
  59. raise ValueError(
  60. f"Integer field {field} does not have a valid value {value}")
  61. elif t == "float":
  62. try:
  63. return float(value)
  64. except ValueError:
  65. raise ValueError(
  66. f"Float field {field} does not have a valid value {value}")
  67. elif t == "str":
  68. return value
  69. else:
  70. raise ValueError("Impossible")
  71. def validate_insertion_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  72. chemicals: list[dict] = []
  73. for row in reader:
  74. chemical = {}
  75. for field, t in _required_fields:
  76. if field not in row:
  77. return [], f"Required field \"{field}\" not present in csv"
  78. try:
  79. value = _validate_type(field, row[field], t)
  80. chemical[field] = value
  81. except ValueError as e:
  82. return [], str(e)
  83. for field, t in _optional_fields:
  84. if field not in row:
  85. continue
  86. try:
  87. value = _validate_type(field, row[field], t)
  88. chemical[field] = value
  89. except ValueError as e:
  90. return [], str(e)
  91. chemicals.append(chemical)
  92. return chemicals, ""
  93. def validate_query_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  94. queries: list[dict] = []
  95. for row in reader:
  96. query = {}
  97. for field, t in _query_fields:
  98. if field not in row:
  99. return [], f"Required field \"{field}\" not present in csv"
  100. try:
  101. value = _validate_type(field, row[field], t)
  102. query[field] = value
  103. except ValueError as e:
  104. return [], str(e)
  105. # year_max, month_max, day_max = query.get(
  106. # 'year_max'), query.get('month_max'), query.get('day_max')
  107. # try:
  108. # d = date(year_max, month_max, day_max)
  109. # query["date"] = d
  110. # except ValueError as e:
  111. # return [], f"Invalid Date Value Provided for {month_max}/{day_max}/{year_max}"
  112. queries.append(query)
  113. return queries, ""