You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

135 lines
4.0 KiB

  1. import csv
  2. """
  3. Required fields when inserting into the database.
  4. """
  5. _required_fields = [
  6. # the "str" type means that this field can be any valid string.
  7. ("metabolite_name", "str"),
  8. ("formula", "str"),
  9. ("person_name", "str"),
  10. # any field labeled a "float" needs to have a value in decimal notation.
  11. ("mass", "float"),
  12. ("final_mz", "float"),
  13. ("final_rt", "float"),
  14. ("final_adduct", "str"),
  15. ("standard_grp", "str"),
  16. ("person_name", "str"),
  17. ("msms_detected", "yesno"), # Value can either be "Yes" or "No"
  18. ("inchikey", "str"),
  19. ]
  20. """
  21. Optional fields and corresponding types when batch inserting into the database.
  22. """
  23. _optional_fields = [
  24. ("chemical_db_id", "str"),
  25. ("library", "str"),
  26. ("pubchem_cid", "int"), # Only integers are permitted.
  27. ("pubmed_refcount", "int"),
  28. ("standard_class", "str"),
  29. ("inchikey14", "str"),
  30. ("adduct", "str"),
  31. ("detected_adducts", "str"),
  32. ("adduct_calc_mz", "str"),
  33. ("msms_purity", "float"),
  34. ]
  35. """
  36. All fields (excluding those that are commented out) are mandatory to include.
  37. """
  38. _query_fields = [
  39. ("rt_min", "float"),
  40. ("rt_max", "float"),
  41. ("mz_min", "float"),
  42. ("mz_max", "float"),
  43. # ("year_max", "int"),
  44. # ("day_max", "int"),
  45. # ("month_max", "int"),
  46. ]
  47. def _validate_type(field: str, value: str, t):
  48. if t == "yesno":
  49. l = value.strip().lower()
  50. if l == "yes":
  51. return True
  52. elif l == "no":
  53. return False
  54. else:
  55. raise ValueError(
  56. f"Yes/No field {field} does not have a valid value {value}")
  57. elif t == "int":
  58. try:
  59. return int(value)
  60. except ValueError:
  61. raise ValueError(
  62. f"Integer field {field} does not have a valid value {value}")
  63. elif t == "float":
  64. try:
  65. return float(value)
  66. except ValueError:
  67. raise ValueError(
  68. f"Float field {field} does not have a valid value {value}")
  69. elif t == "str":
  70. return value
  71. else:
  72. raise ValueError("Impossible")
  73. def validate_insertion_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  74. chemicals: list[dict] = []
  75. for row in reader:
  76. chemical = {}
  77. for field, t in _required_fields:
  78. if field not in row:
  79. return [], f"Required field \"{field}\" not present in csv"
  80. try:
  81. value = _validate_type(field, row[field], t)
  82. chemical[field] = value
  83. except ValueError as e:
  84. return [], str(e)
  85. for field, t in _optional_fields:
  86. if field not in row:
  87. continue
  88. try:
  89. value = _validate_type(field, row[field], t)
  90. chemical[field] = value
  91. except ValueError as e:
  92. return [], str(e)
  93. chemicals.append(chemical)
  94. return chemicals, ""
  95. def validate_query_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  96. queries: list[dict] = []
  97. for row in reader:
  98. query = {}
  99. for field, t in _query_fields:
  100. if field not in row:
  101. return [], f"Required field \"{field}\" not present in csv"
  102. try:
  103. value = _validate_type(field, row[field], t)
  104. query[field] = value
  105. except ValueError as e:
  106. return [], str(e)
  107. # year_max, month_max, day_max = query.get(
  108. # 'year_max'), query.get('month_max'), query.get('day_max')
  109. # try:
  110. # d = date(year_max, month_max, day_max)
  111. # query["date"] = d
  112. # except ValueError as e:
  113. # return [], f"Invalid Date Value Provided for {month_max}/{day_max}/{year_max}"
  114. queries.append(query)
  115. return queries, ""