Skip to content

Preprocessor API

axetract.preprocessor.axe_preprocessor.AXEPreprocessor

Bases: BasePreprocessor

Component for fetching and chunking HTML content.

Attributes:

Name Type Description
fetch_workers int

Number of parallel threads for fetching URLs.

cpu_workers int

Number of parallel processes/threads for cleaning and chunking.

extra_remove_tags List[str]

Additional HTML tags to remove.

strip_attrs bool

Whether to remove all tag attributes.

strip_links bool

Whether to replace &lt;a&gt; tags with text.

keep_tags bool

Whether to preserve HTML tags in the output.

use_clean_rag bool

Whether to use htmlrag for cleaning.

use_clean_chunker bool

Whether the chunker should expect clean HTML.

chunk_size int

Targeted token size for each chunk.

attr_cutoff_len int

Length threshold for attribute retention.

disable_chunking bool

Whether to skip the chunking step.

Source code in src/axetract/preprocessor/axe_preprocessor.py
class AXEPreprocessor(BasePreprocessor):
    """Component for fetching and chunking HTML content.

    Attributes:
        fetch_workers (int): Number of parallel threads for fetching URLs.
        cpu_workers (int): Number of parallel processes/threads for cleaning and chunking.
        extra_remove_tags (List[str]): Additional HTML tags to remove.
        strip_attrs (bool): Whether to remove all tag attributes.
        strip_links (bool): Whether to replace <a> tags with text.
        keep_tags (bool): Whether to preserve HTML tags in the output.
        use_clean_rag (bool): Whether to use htmlrag for cleaning.
        use_clean_chunker (bool): Whether the chunker should expect clean HTML.
        chunk_size (int): Targeted token size for each chunk.
        attr_cutoff_len (int): Length threshold for attribute retention.
        disable_chunking (bool): Whether to skip the chunking step.
    """

    def __init__(
        self,
        name: str = "AXEPreprocessor",
        fetch_workers: int = mp.cpu_count(),
        cpu_workers: int = mp.cpu_count(),
        extra_remove_tags: List[str] | None = None,
        strip_attrs: bool = True,
        strip_links: bool = True,
        keep_tags: bool = True,
        use_clean_rag: bool = True,
        use_clean_chunker: bool = True,
        chunk_size: int = 2000,
        attr_cutoff_len: int = 5,
        disable_chunking: bool = False,
    ):
        """Initialize the preprocessor.

        Args:
            name (str): Component name.
            fetch_workers (int): Fetching thread count.
            cpu_workers (int): Cleaning process count.
            extra_remove_tags (List[str], optional): Tags to strip; defaults to
                ``["header", "footer"]`` when ``None``.
            strip_attrs (bool): Strip attributes flag.
            strip_links (bool): Strip <a> tags flag.
            keep_tags (bool): Keep HTML tags flag.
            use_clean_rag (bool): Use htmlrag flag.
            use_clean_chunker (bool): Clean chunker flag.
            chunk_size (int): Chunk token limit.
            attr_cutoff_len (int): Attribute length limit.
            disable_chunking (bool): Disable chunking flag.
        """
        super().__init__(name)
        self.fetch_workers = fetch_workers
        self.cpu_workers = cpu_workers

        # Default removes common page chrome (headers/footers).
        self.extra_remove_tags = extra_remove_tags if extra_remove_tags is not None else ["header", "footer"]
        self.strip_attrs = strip_attrs
        self.strip_links = strip_links
        self.keep_tags = keep_tags
        self.use_clean_rag = use_clean_rag
        self.use_clean_chunker = use_clean_chunker
        self.chunk_size = chunk_size
        self.attr_cutoff_len = attr_cutoff_len
        self.disable_chunking = disable_chunking

    def __call__(self, samples: list[AXESample] | AXESample) -> List[AXESample]:
        """Fetch, clean, and chunk a batch of samples.

        Args:
            samples (list[AXESample] | AXESample): Input samples (URLs or raw HTML).
                A single sample is accepted and treated as a batch of one.

        Returns:
            list[AXESample]: Samples with chunks populated.
        """
        if isinstance(samples, AXESample):
            samples = [samples]

        n = len(samples)
        if n == 0:
            return []

        def _quick_fetch(sample: AXESample) -> AXESample:
            # Resolve URL samples to raw content in place. On failure the
            # error text is stored as content so downstream stages still
            # receive a sample instead of the whole batch failing.
            if sample.is_content_url:
                try:
                    sample.content = fetch_content(sample.content)
                except Exception as e:
                    sample.content = f"[Fetch ERROR] {e}"
                sample.is_content_url = False
            return sample

        # Fetch content for any URL samples; I/O-bound, so threads suffice.
        with ThreadPoolExecutor(max_workers=min(self.fetch_workers, max(1, n))) as tpool:
            samples = list(tpool.map(_quick_fetch, samples))

        # Cleaning/chunking is CPU-bound: prefer processes when more than one
        # worker is configured, otherwise fall back to a thread pool.
        use_processes = bool(self.cpu_workers and self.cpu_workers > 1)
        executor_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor

        if use_processes:
            max_workers = min(self.cpu_workers or n, max(1, n))
        else:
            # For the thread fallback, use fetch_workers if provided, capped at n.
            max_workers = min(self.fetch_workers or n, max(1, n))

        # Prepare enumerated args (sample, config, index) for the worker.
        items = [(samples[i], self, i) for i in range(n)]

        # Fix: the original initialized this list twice (once untyped, once
        # with a `# type: ignore`); a single, correctly-typed init suffices.
        results: list[Dict | None] = [None] * n

        # Executor.map preserves input order, so idx lines up with samples[idx].
        with executor_cls(max_workers=max_workers) as ex:
            for idx, res in enumerate(ex.map(_chunk_worker, items)):
                results[idx] = res

        for i, res in enumerate(results):
            samples[i].chunks = [
                AXEChunk(chunkid=chunk["chunkid"], content=chunk["chunkcontent"])
                for chunk in res["chunks"]
            ]

        return samples

__call__(samples)

Fetch, clean, and chunk a batch of samples.

Parameters:

Name Type Description Default
samples list[AXESample]

Input samples (URLs or raw HTML).

required

Returns:

Type Description
List[AXESample]

list[AXESample]: Samples with chunks populated.

Source code in src/axetract/preprocessor/axe_preprocessor.py
def __call__(self, samples: list[AXESample] | AXESample) -> List[AXESample]:
    """Fetch, clean, and chunk a batch of samples.

    Args:
        samples (list[AXESample] | AXESample): Input samples (URLs or raw HTML).
            A single sample is accepted and treated as a batch of one.

    Returns:
        list[AXESample]: Samples with chunks populated.
    """
    if isinstance(samples, AXESample):
        samples = [samples]

    n = len(samples)
    if n == 0:
        return []

    def _quick_fetch(sample: AXESample) -> AXESample:
        # Resolve URL samples to raw content in place. On failure the
        # error text is stored as content so downstream stages still
        # receive a sample instead of the whole batch failing.
        if sample.is_content_url:
            try:
                sample.content = fetch_content(sample.content)
            except Exception as e:
                sample.content = f"[Fetch ERROR] {e}"
            sample.is_content_url = False
        return sample

    # Fetch content for any URL samples; I/O-bound, so threads suffice.
    with ThreadPoolExecutor(max_workers=min(self.fetch_workers, max(1, n))) as tpool:
        samples = list(tpool.map(_quick_fetch, samples))

    # Cleaning/chunking is CPU-bound: prefer processes when more than one
    # worker is configured, otherwise fall back to a thread pool.
    use_processes = bool(self.cpu_workers and self.cpu_workers > 1)
    executor_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor

    if use_processes:
        max_workers = min(self.cpu_workers or n, max(1, n))
    else:
        # For the thread fallback, use fetch_workers if provided, capped at n.
        max_workers = min(self.fetch_workers or n, max(1, n))

    # Prepare enumerated args (sample, config, index) for the worker.
    items = [(samples[i], self, i) for i in range(n)]

    # Fix: the original initialized this list twice (once untyped, once
    # with a `# type: ignore`); a single, correctly-typed init suffices.
    results: list[Dict | None] = [None] * n

    # Executor.map preserves input order, so idx lines up with samples[idx].
    with executor_cls(max_workers=max_workers) as ex:
        for idx, res in enumerate(ex.map(_chunk_worker, items)):
            results[idx] = res

    for i, res in enumerate(results):
        samples[i].chunks = [
            AXEChunk(chunkid=chunk["chunkid"], content=chunk["chunkcontent"])
            for chunk in res["chunks"]
        ]

    return samples

__init__(name='AXEPreprocessor', fetch_workers=mp.cpu_count(), cpu_workers=mp.cpu_count(), extra_remove_tags=None, strip_attrs=True, strip_links=True, keep_tags=True, use_clean_rag=True, use_clean_chunker=True, chunk_size=2000, attr_cutoff_len=5, disable_chunking=False)

Initialize the preprocessor.

Parameters:

Name Type Description Default
name str

Component name.

'AXEPreprocessor'
fetch_workers int

Fetching thread count.

cpu_count()
cpu_workers int

Cleaning process count.

cpu_count()
extra_remove_tags List[str]

Tags to strip.

None
strip_attrs bool

Strip attributes flag.

True
strip_links bool

Strip &lt;a&gt; tags flag.

True
keep_tags bool

Keep HTML tags flag.

True
use_clean_rag bool

Use htmlrag flag.

True
use_clean_chunker bool

Clean chunker flag.

True
chunk_size int

Chunk token limit.

2000
attr_cutoff_len int

Attribute length limit.

5
disable_chunking bool

Disable chunking flag.

False
Source code in src/axetract/preprocessor/axe_preprocessor.py
def __init__(
    self,
    name: str = "AXEPreprocessor",
    fetch_workers: int = mp.cpu_count(),
    cpu_workers: int = mp.cpu_count(),
    extra_remove_tags: List[str] | None = None,
    strip_attrs: bool = True,
    strip_links: bool = True,
    keep_tags: bool = True,
    use_clean_rag: bool = True,
    use_clean_chunker: bool = True,
    chunk_size: int = 2000,
    attr_cutoff_len: int = 5,
    disable_chunking: bool = False,
):
    """Configure the preprocessor and register it under *name*.

    Args:
        name (str): Component name.
        fetch_workers (int): Fetching thread count.
        cpu_workers (int): Cleaning process count.
        extra_remove_tags (List[str], optional): Tags to strip; ``None``
            selects the default of ``["header", "footer"]``.
        strip_attrs (bool): Strip attributes flag.
        strip_links (bool): Strip <a> tags flag.
        keep_tags (bool): Keep HTML tags flag.
        use_clean_rag (bool): Use htmlrag flag.
        use_clean_chunker (bool): Clean chunker flag.
        chunk_size (int): Chunk token limit.
        attr_cutoff_len (int): Attribute length limit.
        disable_chunking (bool): Disable chunking flag.
    """
    super().__init__(name)

    # Parallelism settings.
    self.fetch_workers = fetch_workers
    self.cpu_workers = cpu_workers

    # HTML cleaning options; the default removes common page chrome.
    if extra_remove_tags is None:
        extra_remove_tags = ["header", "footer"]
    self.extra_remove_tags = extra_remove_tags
    self.strip_attrs = strip_attrs
    self.strip_links = strip_links
    self.keep_tags = keep_tags
    self.use_clean_rag = use_clean_rag
    self.use_clean_chunker = use_clean_chunker

    # Chunking options.
    self.chunk_size = chunk_size
    self.attr_cutoff_len = attr_cutoff_len
    self.disable_chunking = disable_chunking

axetract.preprocessor.base_preprocessor.BasePreprocessor

Bases: ABC

Abstract base class for all preprocessors.

Source code in src/axetract/preprocessor/base_preprocessor.py
class BasePreprocessor(ABC):
    """Abstract base class that all preprocessors derive from."""

    def __init__(self, name: str):
        """Record the component's name.

        Args:
            name (str): Component name.
        """
        self.name = name

    @abstractmethod
    def __call__(self, samples: List[AXESample]) -> List[AXESample]:
        """Run the preprocessing step over a batch of samples.

        Subclasses must override this method.

        Args:
            samples (List[AXESample]): Input samples.

        Returns:
            List[AXESample]: Processed samples.
        """
        raise NotImplementedError

__call__(samples) abstractmethod

Process a batch of samples.

Parameters:

Name Type Description Default
samples List[AXESample]

Input samples.

required

Returns:

Type Description
List[AXESample]

List[AXESample]: Processed samples.

Source code in src/axetract/preprocessor/base_preprocessor.py
@abstractmethod
def __call__(self, samples: List[AXESample]) -> List[AXESample]:
    """Run the preprocessing step over a batch of samples.

    Subclasses must override this method.

    Args:
        samples (List[AXESample]): Input samples.

    Returns:
        List[AXESample]: Processed samples.
    """
    raise NotImplementedError

__init__(name)

Initialize the preprocessor.

Parameters:

Name Type Description Default
name str

Component name.

required
Source code in src/axetract/preprocessor/base_preprocessor.py
def __init__(self, name: str):
    """Record the component's name.

    Args:
        name (str): Component name.
    """
    self.name = name