""" tei.py — Parse a RELAX NG schema and produce a TEISchema for use with annotate(). """ from __future__ import annotations from pathlib import Path from lxml import etree from .models.schema import TEIAttribute, TEIElement, TEISchema # Namespace URIs _RNG_NS = "http://relaxng.org/ns/structure/1.0" _A_NS = "http://relaxng.org/ns/compatibility/annotations/1.0" _RNG = f"{{{_RNG_NS}}}" def _local(tag: str) -> str: """Strip the Clark-notation namespace from a tag, returning just the local name.""" return tag.split("}", 1)[1] if "}" in tag else tag def _get_doc(node: etree._Element) -> str: """Return the text of the first a:documentation child of *node*, or ''.""" doc_tag = f"{{{_A_NS}}}documentation" for child in node: if isinstance(child.tag, str) and child.tag == doc_tag: return (child.text or "").strip() return "" # --------------------------------------------------------------------------- # Build lookup tables from the parsed RNG tree # --------------------------------------------------------------------------- def _build_defines(root: etree._Element) -> dict[str, etree._Element]: """Map every in the schema to its element node.""" defines: dict[str, etree._Element] = {} for define in root.iter(f"{_RNG}define"): name = define.get("name") if name: defines[name] = define return defines def _build_elem_to_define(defines: dict[str, etree._Element]) -> dict[str, str]: """Map TEI element names (e.g. 'persName') to their RNG define names.""" result: dict[str, str] = {} for def_name, def_node in defines.items(): elem = def_node.find(f"{_RNG}element") if elem is not None: tei_name = elem.get("name") if tei_name: result[tei_name] = def_name return result # --------------------------------------------------------------------------- # Content-model traversal: collect child element names # --------------------------------------------------------------------------- def _collect_child_elements( node: etree._Element, defines: dict[str, etree._Element], visited: set[str], ) -> list[str]: """ Walk *node* (an RNG define or structural node) and return the names of all TEI elements that can appear as immediate children according to the content model. - Follows nodes recursively to expand macros and model classes. - Stops at boundaries (their inner content is not traversed). - Skips attribute-group refs (those whose name contains ".att" or "att."). """ result: list[str] = [] for child in node: if not isinstance(child.tag, str): continue local = _local(child.tag) if local == "element": # This RNG defines a TEI child element — record its name. name = child.get("name") if name: result.append(name) # Do NOT recurse into the element's own content (those are grandchildren). elif local == "ref": ref_name = child.get("name", "") if not ref_name: continue # Skip attribute-group references — they contribute no child elements. if "att." in ref_name or ref_name not in defines: continue target = defines[ref_name] # If the define directly wraps a TEI , record its name and stop — # do NOT recurse into the element's own content (those are grandchildren). # This also handles self-referential elements (e.g. idno containing idno). elem_child = target.find(f"{_RNG}element") if elem_child is not None: name = elem_child.get("name") if name: result.append(name) elif ref_name not in visited: # It's a model/macro group — recurse to expand it. visited.add(ref_name) result.extend(_collect_child_elements(target, defines, visited)) elif local in ("notAllowed", "empty", "text", "data", "param", "value", "attribute"): # Terminal or attribute nodes — no child elements here. continue else: # Structural wrappers: choice, group, interleave, optional, # zeroOrMore, oneOrMore, grammar, … result.extend(_collect_child_elements(child, defines, visited)) return result # --------------------------------------------------------------------------- # Attribute traversal: collect TEIAttribute instances # --------------------------------------------------------------------------- def _collect_attributes( node: etree._Element, defines: dict[str, etree._Element], visited: set[str], ) -> list[TEIAttribute]: """ Walk *node* and collect all elements, following only attribute-group nodes (those whose name contains "att."). Inline elements directly in the element definition are also picked up. """ result: list[TEIAttribute] = [] for child in node: if not isinstance(child.tag, str): continue local = _local(child.tag) if local == "attribute": attr = _parse_attribute(child, node) if attr is not None: result.append(attr) elif local == "ref": ref_name = child.get("name", "") if not ref_name or ref_name in visited: continue # Only follow attribute-group refs. if "att." not in ref_name: continue if ref_name not in defines: continue visited.add(ref_name) result.extend(_collect_attributes(defines[ref_name], defines, visited)) elif local == "element": # Don't descend into nested element definitions. continue else: # Structural wrappers: optional, choice, group, etc. result.extend(_collect_attributes(child, defines, visited)) return result def _parse_attribute( attr_node: etree._Element, parent: etree._Element, ) -> TEIAttribute | None: """ Convert an RNG element to a TEIAttribute. *parent* is the direct parent of *attr_node* in the RNG tree; it is used to determine whether the attribute is required (i.e. not wrapped in or ). """ name = attr_node.get("name") if not name: return None description = _get_doc(attr_node) required = _local(parent.tag) not in ("optional", "zeroOrMore") # Collect explicit enumerated values from a inside the attribute. allowed_values: list[str] | None = None for choice_node in attr_node.iter(f"{_RNG}choice"): values = [ v.text.strip() for v in choice_node.findall(f"{_RNG}value") if v.text and v.text.strip() ] if values: allowed_values = values break return TEIAttribute( name=name, description=description, required=required, allowed_values=allowed_values, ) # --------------------------------------------------------------------------- # Build a single TEIElement # --------------------------------------------------------------------------- def _build_tei_element( elem_name: str, defines: dict[str, etree._Element], elem_to_def: dict[str, str], ) -> TEIElement | None: """Construct a TEIElement for the named TEI element, or None if not found.""" def_name = elem_to_def.get(elem_name) if not def_name: return None def_node = defines[def_name] elem_node = def_node.find(f"{_RNG}element") if elem_node is None: return None description = _get_doc(elem_node) # Child elements (content-model expansion). child_visited: set[str] = {def_name} children = _collect_child_elements(elem_node, defines, child_visited) # Attributes (attribute-group expansion + inline attributes). attr_visited: set[str] = {def_name} attributes = _collect_attributes(elem_node, defines, attr_visited) # Deduplicate children and attributes while preserving order. seen_children: set[str] = set() unique_children: list[str] = [] for c in children: if c not in seen_children: seen_children.add(c) unique_children.append(c) seen_attrs: set[str] = set() unique_attrs: list[TEIAttribute] = [] for a in attributes: if a.name not in seen_attrs: seen_attrs.add(a.name) unique_attrs.append(a) return TEIElement( tag=elem_name, description=description, allowed_children=unique_children, attributes=unique_attrs, ) # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def create_schema( schema_path: str | Path, element: str = "text", depth: int = 1, ) -> TEISchema: """ Parse a RELAX NG (.rng) schema and build a :class:`TEISchema` suitable for passing to :func:`tei_annotator.annotate`. Parameters ---------- schema_path: Path to the ``.rng`` file. element: Name of the root TEI element to start from (default: ``"text"``). depth: How many levels of descendant elements to include in the returned schema. ``depth=1`` includes the root element **and** its direct children; ``depth=2`` also includes their children; and so on. Returns ------- TEISchema A :class:`TEISchema` containing :class:`TEIElement` entries for the root element and all descendants up to *depth* levels deep. Raises ------ ValueError If *element* is not defined in the schema. """ tree = etree.parse(str(schema_path)) root = tree.getroot() defines = _build_defines(root) elem_to_def = _build_elem_to_define(defines) if element not in elem_to_def: raise ValueError( f"Element '{element}' not found in schema '{schema_path}'. " f"Available elements: {sorted(elem_to_def)[:10]} …" ) # BFS: process level by level up to `depth` levels deep. tei_elements: list[TEIElement] = [] seen: set[str] = set() current_level: list[str] = [element] for level in range(depth + 1): next_level: list[str] = [] for elem_name in current_level: if elem_name in seen: continue seen.add(elem_name) tei_elem = _build_tei_element(elem_name, defines, elem_to_def) if tei_elem is None: continue tei_elements.append(tei_elem) # Queue children for the next level (if we haven't hit the depth limit). if level < depth: next_level.extend(tei_elem.allowed_children) current_level = next_level return TEISchema(elements=tei_elements)