Skip to content

Postprocessor API

axetract.postprocessor.axe_postprocessor.AXEPostprocessor

Bases: BasePostprocessor

Optimized PostProcessor for high-throughput batch processing.

This component handles JSON parsing, repair, and grounded XPath resolution (GXR) to map extracted values back to the original document.

Uses a parse-once indexing strategy: each document's HTML is parsed into a search index exactly once, and all extracted fields are matched against that index. This eliminates the O(fields × parse_cost) bottleneck.

Attributes:

Name Type Description
name str

Component name.

exact_extraction bool

Whether to run grounded XPath resolution (GXR) to map extracted values back to their source XPaths.

Source code in src/axetract/postprocessor/axe_postprocessor.py
class AXEPostprocessor(BasePostprocessor):
    """Optimized PostProcessor for high-throughput batch processing.

    This component handles JSON parsing, repair, and grounded XPath resolution (GXR)
    to map extracted values back to the original document.

    Uses a parse-once indexing strategy: each document's HTML is parsed into a
    search index exactly once, and all extracted fields are matched against that
    index. This eliminates the O(fields × parse_cost) bottleneck.

    Attributes:
        name (str): Component name.
        exact_extraction (bool): Whether grounded XPath resolution is enabled.
            Exposed as a property backed by the private ``_exact_extraction`` flag.
    """

    def __init__(self, name: str = "axe_postprocessor", exact_extraction: bool = True):
        """Initialize the postprocessor.

        Args:
            name (str): Component name.
            exact_extraction (bool): Enable grounded XPath resolution.
        """
        super().__init__(name=name)
        self._exact_extraction = exact_extraction

    @property
    def exact_extraction(self) -> bool:
        """Return whether grounded XPath resolution is enabled."""
        return self._exact_extraction

    @exact_extraction.setter
    def exact_extraction(self, value: bool) -> None:
        """Enable or disable grounded XPath resolution for later calls."""
        self._exact_extraction = value

    def __call__(self, samples: List[AXESample]) -> List[AXESample]:
        """Clean, repair, and ground a batch of extraction samples.

        For every sample, the raw prediction, the current HTML, and the query
        (or the schema model when the query is falsy) are handed to
        ``_safe_extract_worker`` on a thread pool. The samples are then updated
        in place: ``prediction`` receives the first element of the worker's
        result and ``xpaths`` the second.

        Args:
            samples (List[AXESample]): Samples with raw LLM predictions.

        Returns:
            List[AXESample]: The same sample objects, with structured predictions
            and XPaths. An empty input is returned unchanged.
        """
        if not samples:
            return samples

        # Leave one core free for system stability, but always keep one worker.
        n_workers = max(1, (os.cpu_count() or 2) - 1)

        # The same flag applies to the whole batch; ex.map needs one value per item.
        extract_flags = [self._exact_extraction] * len(samples)

        # sample.prediction is expected to hold the raw LLM string produced by the
        # extractor; anything that is not already a string is stringified.
        responses = [
            s.prediction if isinstance(s.prediction, str) else str(s.prediction) for s in samples
        ]
        queries = [s.query or s.schema_model for s in samples]

        # Matching runs against the current HTML processing output.
        contents = [s.current_html for s in samples]

        # Threads (not processes) are used so the large HTML strings are shared
        # rather than pickled.
        # NOTE(review): the per-sample work is CPU-bound; how much it gains from
        # threads depends on how much of _safe_extract_worker runs outside the
        # GIL - confirm against the worker implementation.
        with ThreadPoolExecutor(max_workers=n_workers) as ex:
            parsed_results = list(
                ex.map(_safe_extract_worker, responses, contents, queries, extract_flags)
            )

        # ex.map yields results in input order, so they line up with samples.
        for sample, (parsed, xpaths) in zip(samples, parsed_results):
            sample.prediction = parsed
            sample.xpaths = xpaths

        return samples

__call__(samples)

Clean, repair, and ground a batch of extraction samples.

Parameters:

Name Type Description Default
samples List[AXESample]

Samples with raw LLM predictions.

required

Returns:

Type Description
List[AXESample]

Samples with structured predictions and XPaths.

Source code in src/axetract/postprocessor/axe_postprocessor.py
def __call__(self, samples: List[AXESample]) -> List[AXESample]:
    """Parse, repair, and ground the raw predictions of a batch of samples.

    The per-sample work is delegated to ``_safe_extract_worker`` on a thread
    pool; each sample is then updated in place with the worker's two results
    (``prediction`` and ``xpaths``).

    Args:
        samples (List[AXESample]): Samples with raw LLM predictions.

    Returns:
        List[AXESample]: The same samples with structured predictions and
        XPaths. An empty batch is returned as-is.
    """
    if not samples:
        return samples

    # Keep one core in reserve for the rest of the system (minimum one worker).
    worker_count = max(1, (os.cpu_count() or 2) - 1)

    # One copy of the grounding flag per sample, as required by pool.map.
    grounding_flags = [self._exact_extraction] * len(samples)

    # The prediction is expected to be the raw LLM string; coerce anything else.
    raw_outputs = [
        item.prediction if isinstance(item.prediction, str) else str(item.prediction)
        for item in samples
    ]
    targets = [item.query or item.schema_model for item in samples]

    # Grounding is done against the current HTML processing output.
    html_docs = [item.current_html for item in samples]

    # A thread pool avoids pickling the large HTML strings.
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        outcomes = list(
            pool.map(_safe_extract_worker, raw_outputs, html_docs, targets, grounding_flags)
        )

    # Results arrive in input order; write each pair back onto its sample.
    for item, outcome in zip(samples, outcomes):
        item.prediction, item.xpaths = outcome

    return samples

__init__(name='axe_postprocessor', exact_extraction=True)

Initialize the postprocessor.

Parameters:

Name Type Description Default
name str

Component name.

'axe_postprocessor'
exact_extraction bool

Enable grounded XPath resolution.

True
Source code in src/axetract/postprocessor/axe_postprocessor.py
def __init__(self, name: str = "axe_postprocessor", exact_extraction: bool = True) -> None:
    """Initialize the postprocessor.

    Args:
        name (str): Component name, forwarded to the base-class initializer.
        exact_extraction (bool): Enable grounded XPath resolution. Stored on
            the instance as the private ``_exact_extraction`` flag.
    """
    super().__init__(name=name)
    self._exact_extraction = exact_extraction

axetract.postprocessor.base_postprocessor.BasePostprocessor

Bases: ABC

Abstract base class for all postprocessors.

Source code in src/axetract/postprocessor/base_postprocessor.py
class BasePostprocessor(ABC):
    """Common interface that every postprocessor implements.

    Subclasses must provide ``__call__``; the class cannot be instantiated
    until that method is overridden.
    """

    def __init__(self, name: str) -> None:
        """Store the component name on the instance.

        Args:
            name (str): Component name.
        """
        self.name = name

    @abstractmethod
    def __call__(self, samples: List[AXESample]) -> List[AXESample]:
        """Run postprocessing over a batch of samples.

        Args:
            samples (List[AXESample]): Input samples.

        Returns:
            List[AXESample]: Postprocessed samples.

        Raises:
            NotImplementedError: Always, when this base implementation is
                invoked directly.
        """
        raise NotImplementedError

__call__(samples) abstractmethod

Postprocess a batch of samples.

Parameters:

Name Type Description Default
samples List[AXESample]

Input samples.

required

Returns:

Type Description
List[AXESample]

Postprocessed samples.

Source code in src/axetract/postprocessor/base_postprocessor.py
@abstractmethod
def __call__(self, samples: List[AXESample]) -> List[AXESample]:
    """Postprocess a batch of samples.

    Concrete postprocessors must override this method; this base
    implementation only raises.

    Args:
        samples (List[AXESample]): Input samples.

    Returns:
        List[AXESample]: Postprocessed samples.

    Raises:
        NotImplementedError: Always, when this base implementation is
            invoked directly.
    """
    raise NotImplementedError

__init__(name)

Initialize the postprocessor.

Parameters:

Name Type Description Default
name str

Component name.

required
Source code in src/axetract/postprocessor/base_postprocessor.py
def __init__(self, name: str) -> None:
    """Initialize the postprocessor.

    Args:
        name (str): Component name; stored on the instance as ``name``.
    """
    self.name = name