You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

133 lines
4.0 KiB

  1. import csv
  2. from datetime import date
  3. """
  4. Required fields when inserting into the database.
  5. """
  6. _required_fields = [
  7. # the "str" type means that this field can be any valid string.
  8. ("name", "str"),
  9. ("formula", "str"),
  10. # any field labeled a "float" needs to have a value in decimal notation.
  11. ("mass", "float"),
  12. ("final_mz", "float"),
  13. ("final_rt", "float"),
  14. ]
  15. """
  16. Optional fields and corresponding types when batch inserting into the database.
  17. """
  18. _optional_fields = [
  19. ("chemical_db_id", "str"),
  20. ("library", "str"),
  21. ("pubchem_cid", "int"), # Only integers are permitted.
  22. ("pubmed_refcount", "int"),
  23. ("standard_class", "str"),
  24. ("inchikey", "str"),
  25. ("inchikey14", "str"),
  26. ("final_adduct", "str"),
  27. ("adduct", "str"),
  28. ("detected_adducts", "str"),
  29. ("adduct_calc_mz", "str"),
  30. ("msms_detected", "yesno"), # Value can either be "Yes" or "No"
  31. ("msms_purity", "float"),
  32. ]
  33. """
  34. All fields (excluding those that are commented) are mandatory to include.
  35. """
  36. _query_fields = [
  37. ("rt_min", "float"),
  38. ("rt_max", "float"),
  39. ("mz_min", "float"),
  40. ("mz_max", "float"),
  41. # ("year_max", "int"),
  42. # ("day_max", "int"),
  43. # ("month_max", "int"),
  44. ]
  45. def _validate_type(field: str, value: str, t):
  46. if t == "yesno":
  47. l = value.strip().lower()
  48. if l == "yes":
  49. return True
  50. elif l == "no":
  51. return False
  52. else:
  53. raise ValueError(
  54. f"Yes/No field {field} does not have a valid value {value}")
  55. elif t == "int":
  56. try:
  57. return int(value)
  58. except ValueError:
  59. raise ValueError(
  60. f"Integer field {field} does not have a valid value {value}")
  61. elif t == "float":
  62. try:
  63. return float(value)
  64. except ValueError:
  65. raise ValueError(
  66. f"Float field {field} does not have a valid value {value}")
  67. elif t == "str":
  68. return value
  69. else:
  70. raise ValueError("Impossible")
  71. def validate_insertion_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  72. chemicals: list[dict] = []
  73. for row in reader:
  74. chemical = {}
  75. for field, t in _required_fields:
  76. if field not in row:
  77. return [], f"Required field \"{field}\" not present in csv"
  78. try:
  79. value = _validate_type(field, row[field], t)
  80. chemical[field] = value
  81. except ValueError as e:
  82. return [], str(e)
  83. for field, t in _optional_fields:
  84. if field not in row:
  85. continue
  86. try:
  87. value = _validate_type(field, row[field], t)
  88. chemical[field] = value
  89. except ValueError as e:
  90. return [], str(e)
  91. chemicals.append(chemical)
  92. return chemicals, ""
  93. def validate_query_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  94. queries: list[dict] = []
  95. for row in reader:
  96. query = {}
  97. for field, t in _query_fields:
  98. if field not in row:
  99. return [], f"Required field \"{field}\" not present in csv"
  100. try:
  101. value = _validate_type(field, row[field], t)
  102. query[field] = value
  103. except ValueError as e:
  104. return [], str(e)
  105. # year_max, month_max, day_max = query.get(
  106. # 'year_max'), query.get('month_max'), query.get('day_max')
  107. # try:
  108. # d = date(year_max, month_max, day_max)
  109. # query["date"] = d
  110. # except ValueError as e:
  111. # return [], f"Invalid Date Value Provided for {month_max}/{day_max}/{year_max}"
  112. queries.append(query)
  113. return queries, ""