You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

134 lines
4.0 KiB

  1. import csv
  2. """
  3. Required fields when inserting into the database.
  4. """
  5. _required_fields = [
  6. # the "str" type means that this field can be any valid string.
  7. ("name", "str"),
  8. ("formula", "str"),
  9. # any field labeled a "float" needs to have a value in decimal notation.
  10. ("mass", "float"),
  11. ("final_mz", "float"),
  12. ("final_rt", "float"),
  13. ("final_adduct", "str"),
  14. ("standard_grp", "str"),
  15. ("uploaded_by", "str"),
  16. ]
  17. """
  18. Optional fields and corresponding types when batch inserting into the database.
  19. """
  20. _optional_fields = [
  21. ("chemical_db_id", "str"),
  22. ("library", "str"),
  23. ("pubchem_cid", "int"), # Only integers are permitted.
  24. ("pubmed_refcount", "int"),
  25. ("standard_class", "str"),
  26. ("inchikey", "str"),
  27. ("inchikey14", "str"),
  28. ("adduct", "str"),
  29. ("detected_adducts", "str"),
  30. ("adduct_calc_mz", "str"),
  31. ("msms_detected", "yesno"), # Value can either be "Yes" or "No"
  32. ("msms_purity", "float"),
  33. ]
  34. """
  35. All fields (excluding those that are commented) are mandatory to include.
  36. """
  37. _query_fields = [
  38. ("rt_min", "float"),
  39. ("rt_max", "float"),
  40. ("mz_min", "float"),
  41. ("mz_max", "float"),
  42. # ("year_max", "int"),
  43. # ("day_max", "int"),
  44. # ("month_max", "int"),
  45. ]
  46. def _validate_type(field: str, value: str, t):
  47. if t == "yesno":
  48. l = value.strip().lower()
  49. if l == "yes":
  50. return True
  51. elif l == "no":
  52. return False
  53. else:
  54. raise ValueError(
  55. f"Yes/No field {field} does not have a valid value {value}")
  56. elif t == "int":
  57. try:
  58. return int(value)
  59. except ValueError:
  60. raise ValueError(
  61. f"Integer field {field} does not have a valid value {value}")
  62. elif t == "float":
  63. try:
  64. return float(value)
  65. except ValueError:
  66. raise ValueError(
  67. f"Float field {field} does not have a valid value {value}")
  68. elif t == "str":
  69. return value
  70. else:
  71. raise ValueError("Impossible")
  72. def validate_insertion_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  73. chemicals: list[dict] = []
  74. for row in reader:
  75. chemical = {}
  76. for field, t in _required_fields:
  77. if field not in row:
  78. return [], f"Required field \"{field}\" not present in csv"
  79. try:
  80. value = _validate_type(field, row[field], t)
  81. chemical[field] = value
  82. except ValueError as e:
  83. return [], str(e)
  84. for field, t in _optional_fields:
  85. if field not in row:
  86. continue
  87. try:
  88. value = _validate_type(field, row[field], t)
  89. chemical[field] = value
  90. except ValueError as e:
  91. return [], str(e)
  92. chemicals.append(chemical)
  93. return chemicals, ""
  94. def validate_query_csv_fields(reader: csv.DictReader) -> tuple[list[dict], str]:
  95. queries: list[dict] = []
  96. for row in reader:
  97. query = {}
  98. for field, t in _query_fields:
  99. if field not in row:
  100. return [], f"Required field \"{field}\" not present in csv"
  101. try:
  102. value = _validate_type(field, row[field], t)
  103. query[field] = value
  104. except ValueError as e:
  105. return [], str(e)
  106. # year_max, month_max, day_max = query.get(
  107. # 'year_max'), query.get('month_max'), query.get('day_max')
  108. # try:
  109. # d = date(year_max, month_max, day_max)
  110. # query["date"] = d
  111. # except ValueError as e:
  112. # return [], f"Invalid Date Value Provided for {month_max}/{day_max}/{year_max}"
  113. queries.append(query)
  114. return queries, ""