"""Module for handling taxonomies."""
from __future__ import annotations
import datetime
import json
from functools import lru_cache
from pathlib import Path
from typing import Any
from jsonschema import validate
from jsonschema.exceptions import ValidationError
from typing_extensions import Literal
# Taxonomy levels recognised by this module.
TAXONOMY_LEVEL = Literal["stage_plus"]

# Every field type a taxonomy field definition may declare.
FIELD_TYPE = Literal[
    "number",
    "categorical",
    "text",
    "string",
    "multi_categorical",
    "date-time",
    "date",
    "polygon",
]

# Python type a value must be an instance of, keyed by taxonomy field type.
type_mapping = {
    # scalar values
    "number": int,
    "text": str,
    "string": str,
    # dates are carried as strings
    "date-time": str,
    "date": str,
    # a single option is a string ...
    "categorical": str,
    # ... while these two types carry lists
    "multi_categorical": list,
    "polygon": list,
}

# JSON schema "type" keyword emitted for each taxonomy field type.
jsonschema_type_mapping = {
    # scalar values
    "number": "integer",
    "text": "string",
    "string": "string",
    # dates are carried as strings
    "date-time": "string",
    "date": "string",
    # a single option is a string ...
    "categorical": "string",
    # ... while these two types carry arrays
    "multi_categorical": "array",
    "polygon": "array",
}
@lru_cache(maxsize=1)
def _get_geojson_geometry_schema() -> dict:
    """
    Load the GeoJSON geometry JSON schema that ships next to this module.

    The file is read and parsed on the first call only; ``lru_cache`` hands the
    same dict object back on every later call, so callers must not mutate it.

    Returns:
        The parsed content of ``json_schema/geojson_geometry_schema.json``.
    """
    schema_path = Path(__file__).parent / "json_schema/geojson_geometry_schema.json"
    # Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
    return json.loads(schema_path.read_text(encoding="utf-8"))
class TaxonomyComplianceError(Exception):
    """Signal that a piece of data violates the taxonomy it is checked against."""
class TaxonomyCategoricalValue:
    """
    One selectable option of a taxonomy field.

    Attributes:
        value: The option itself, taken from the mandatory ``value`` key.
        label: Content of the optional ``label`` key, ``None`` when absent.
        description: Content of the optional ``description`` key, ``None`` when absent.
    """

    value: str
    label: str | None
    description: str | None

    def __init__(self, value_dict: dict):
        """
        Build the option from its dictionary form.

        Args:
            value_dict: Dictionary holding ``value`` and optionally ``label``
                and ``description``.

        Raises:
            TypeError: If value_dict is not a dictionary or has no ``value`` key
        """
        is_dictionary = isinstance(value_dict, dict)
        if not is_dictionary:
            raise TypeError("value_dict must be a dictionary.")
        has_value = "value" in value_dict
        if not has_value:
            raise TypeError("value_dict must contain a 'value' key.")

        lookup = value_dict.get
        self.value = value_dict["value"]
        self.label = lookup("label")
        self.description = lookup("description")
class TaxonomyField:
    """
    A single field definition of a taxonomy.

    Attributes:
        field_id: Identifier of the field, taken from the ``field`` key.
        type: One of the ``FIELD_TYPE`` names.
        label: Human-readable name; used as the JSON schema ``title``.
        values: Allowed options, or ``None`` when the definition has no ``values`` key.
        description: Optional description taken from the definition.
        tags: Optional tags taken verbatim from the definition.
    """

    field_id: str
    type: FIELD_TYPE
    label: str
    values: list[TaxonomyCategoricalValue] | None
    description: str | None
    tags: list[str] | None

    def __init__(self, field_dict: dict, strict: bool = True):
        """
        Initialize the TaxonomyField from a dictionary.

        Args:
            field_dict: Field definition with the mandatory keys ``field``, ``type``
                and ``label`` and the optional keys ``description``, ``tags`` and
                ``values``.
            strict: When true, categorical and multi_categorical fields must come
                with a non-empty ``values`` list.

        Raises:
            TypeError: If field_dict is not a dictionary, lacks a mandatory key,
                declares an unknown type, or has a ``values`` entry that is not a list
            ValueError: If strict is set and a (multi_)categorical field has no values
        """
        if not isinstance(field_dict, dict):
            raise TypeError("field_dict must be a dictionary.")
        if "field" not in field_dict or "type" not in field_dict or "label" not in field_dict:
            raise TypeError("field_dict must contain 'field', 'type', and 'label' keys.")
        if field_dict["type"] not in FIELD_TYPE.__args__:
            raise TypeError(
                f"Unrecognized field type: {field_dict['type']}. "
                f"Must be one of {FIELD_TYPE.__args__}."
            )

        self.field_id = field_dict["field"]
        self.type = field_dict["type"]
        self.label = field_dict["label"]
        self.description = field_dict.get("description")
        self.tags = field_dict.get("tags")

        self.values = None
        if "values" in field_dict:
            if not isinstance(field_dict["values"], list):
                raise TypeError("field_dict['values'] must be a list.")
            self.values = [TaxonomyCategoricalValue(v) for v in field_dict["values"]]

        if self.type in ["categorical", "multi_categorical"] and not self.values and strict:
            raise ValueError(
                f"Field of type '{self.type}' must have 'values' defined '{self.field_id}'."
            )

    @property
    def __json_schema__(self) -> dict:
        """
        Build the JSON schema definition for this field.

        Polygon fields get a fixed array-of-objects schema that refers to
        ``#/definitions/geojson_geometry``; that definition has to be supplied by
        the enclosing schema.
        """
        field_schema: dict = {"title": self.label, "type": jsonschema_type_mapping[self.type]}
        if self.description:
            field_schema["description"] = self.description

        if self.type in ("date-time", "date"):
            # NOTE(review): "date" fields are advertised with the "date-time" format
            # as well - confirm that this is intended.
            field_schema["format"] = "date-time"
        elif self.type in ("categorical", "multi_categorical"):
            # Without defined values (non-strict fields) no option constraint is emitted.
            if self.values:
                categorical_options = []
                for option in self.values:
                    entry = {"const": option.value, "title": option.label or option.value}
                    if option.description:
                        entry["description"] = option.description
                    categorical_options.append(entry)
                if self.type == "categorical":
                    field_schema["oneOf"] = categorical_options
                else:
                    field_schema["items"] = {"type": "string", "oneOf": categorical_options}
        elif self.type == "polygon":
            # Replaces the generic schema built above, including its title and description.
            field_schema = {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {"geometry": {"$ref": "#/definitions/geojson_geometry"}},
                    "required": ["geometry"],
                },
                "description": "A list of GeoJSON features representing polygons.",
            }
        return field_schema

    def validate(self, value: Any):  # noqa: ANN401
        """
        Validate a single field value against its type and constraints.

        Args:
            value: The value to validate

        Raises:
            TaxonomyComplianceError: If value doesn't comply
        """
        # TODO: Should we enforce required fields?
        # https://shintolabs.atlassian.net/browse/DOT-755
        if value is None:
            raise TaxonomyComplianceError(f"Field '{self.field_id}' is required but missing.")

        expected_type = type_mapping[self.type]
        # bool is a subclass of int, yet the JSON schema "integer" type emitted for
        # number fields does not admit booleans - reject them here as well.
        is_bool_number = self.type == "number" and isinstance(value, bool)
        if is_bool_number or not isinstance(value, expected_type):
            raise TaxonomyComplianceError(
                f"Field '{self.field_id}' expects type {expected_type.__name__}, "
                f"got {type(value).__name__}: {value}"
            )

        if self.type == "multi_categorical":
            self._validate_multi_categorical(value)
        elif self.type == "categorical":
            self._validate_categorical(value)
        elif self.type in ("date-time", "date"):
            self._validate_date_time(value)
        elif self.type == "polygon":
            self._validate_polygon(value)

    def _validate_multi_categorical(self, value: Any):  # noqa: ANN401
        """Validate that every element of a multi_categorical value is an allowed option."""
        # A non-strict field may lack values; there is nothing to check against then,
        # which matches the schema emitted by __json_schema__.
        if not self.values:
            return
        allowed = [option.value for option in self.values]
        for v in value:
            if v not in allowed:
                raise TaxonomyComplianceError(
                    f"Field '{self.field_id}' has invalid value '{v}'. "
                    f"Allowed values are {allowed}"
                )

    def _validate_categorical(self, value: Any):  # noqa: ANN401
        """Validate that a categorical value is one of the allowed options."""
        # A non-strict field may lack values; there is nothing to check against then.
        if not self.values:
            return
        allowed = [option.value for option in self.values]
        if value not in allowed:
            raise TaxonomyComplianceError(
                f"Field '{self.field_id}' has invalid value '{value}'. "
                f"Allowed values are {allowed}"
            )

    def _validate_date_time(self, value: Any):  # noqa: ANN401
        """Validate that a date or date-time value parses as an ISO 8601 string."""
        try:
            datetime.datetime.fromisoformat(value)
        except ValueError as e:
            # Before Python 3.11 fromisoformat() rejects the "Z" UTC designator;
            # retry with the equivalent explicit offset.
            if value.endswith("Z") and "T" in value:
                try:
                    datetime.datetime.fromisoformat(f"{value[:-1]}+00:00")
                except ValueError:
                    pass
                else:
                    return
            raise TaxonomyComplianceError(
                f"Field '{self.field_id}' expects a date-time in ISO 8601 format, got: {value}"
            ) from e

    def _validate_polygon(self, value: Any):  # noqa: ANN401
        """Validate that every item of a polygon value carries a valid GeoJSON geometry."""
        # TODO: Is polygon always a list of GeoJSON features?
        # https://shintolabs.atlassian.net/browse/DOT-755
        for item in value:
            # Items that are not dictionaries are reported as non-compliant data
            # instead of surfacing as a TypeError from the membership test.
            if not isinstance(item, dict) or "geometry" not in item:
                raise TaxonomyComplianceError(
                    f"Field '{self.field_id}' contains invalid GeoJSON polygon: {value}"
                )
            try:
                validate(item["geometry"], _get_geojson_geometry_schema())
            except ValidationError as e:
                raise TaxonomyComplianceError(
                    f"Field '{self.field_id}' contains invalid GeoJSON polygon: {value}"
                ) from e
class Taxonomy:
    """
    A taxonomy: an optional level plus a non-empty list of field definitions.

    Attributes:
        level: Content of the optional ``level`` key, ``None`` when absent.
        fields: The fields of the taxonomy, in definition order.
    """

    level: TAXONOMY_LEVEL | None
    fields: list[TaxonomyField]

    def __init__(self, taxonomy_dict: dict, strict: bool = True):
        """
        Initialize the Taxonomy from a dictionary.

        Args:
            taxonomy_dict: Dictionary with a ``fields`` list and optionally ``level``.
            strict: Passed on to every ``TaxonomyField`` that is created.

        Raises:
            TypeError: If taxonomy_dict is not a dictionary, has no ``fields`` key,
                or ``fields`` is not a list
            ValueError: If the ``fields`` list is empty
        """
        if not isinstance(taxonomy_dict, dict):
            raise TypeError("taxonomy_dict must be a dictionary.")
        if "fields" not in taxonomy_dict:
            raise TypeError("taxonomy_dict must contain a 'fields' key.")
        if not isinstance(taxonomy_dict["fields"], list):
            raise TypeError("taxonomy_dict['fields'] must be a list.")
        if not taxonomy_dict["fields"]:
            raise ValueError("taxonomy_dict['fields'] must contain at least one field.")

        self.level = taxonomy_dict.get("level")
        self.fields = [TaxonomyField(field_dict, strict) for field_dict in taxonomy_dict["fields"]]

    def validate_data(self, data: dict):
        """
        Validate data against the taxonomy.

        Each field is checked at the top level of ``data`` and in every entry of
        ``data["stages"]`` that contains it; fields that are absent are skipped.

        Args:
            data: Project data dictionary (with potential 'stages' array)

        Raises:
            TaxonomyComplianceError: If value doesn't comply
        """
        stages = data["stages"] if "stages" in data else ()
        for field in self.fields:
            # TODO: Or are fields always required?
            # https://shintolabs.atlassian.net/browse/DOT-755
            if field.field_id in data:
                field.validate(data[field.field_id])

            for idx, stage in enumerate(stages):
                if field.field_id not in stage:
                    continue
                try:
                    field.validate(stage[field.field_id])
                except TaxonomyComplianceError as e:
                    # Carry the underlying reason in the message itself so it is
                    # visible without walking the exception chain.
                    raise TaxonomyComplianceError(
                        f"Failed to validate field '{field.field_id}' in stage {idx}: {e}"
                    ) from e

    @property
    def __json_schema__(self) -> dict:
        """
        Convert a taxonomy to a JSON schema.

        The field properties are defined once under ``definitions`` and referenced
        both for the top-level object and for every item of the ``stages`` array.
        """
        # TODO: Are any fields required in the data/schema?
        # https://shintolabs.atlassian.net/browse/DOT-755
        definitions = {
            "taxonomy_field_properties": {
                "type": "object",
                "properties": {field.field_id: field.__json_schema__ for field in self.fields},
                "required": [],
            }
        }

        # Polygon field schemas refer to this definition, so it is only needed with them.
        if any(field.type == "polygon" for field in self.fields):
            definitions["geojson_geometry"] = _get_geojson_geometry_schema()

        return {
            "title": "Taxonomy Data Schema",
            "type": "object",
            "allOf": [{"$ref": "#/definitions/taxonomy_field_properties"}],
            "properties": {
                "stages": {
                    "type": "array",
                    "items": {"$ref": "#/definitions/taxonomy_field_properties"},
                }
            },
            "required": [],
            "definitions": definitions,
        }