Skip to content

Preprocessor API

axetract.preprocessor.axe_preprocessor.AXEPreprocessor

Bases: BasePreprocessor

Component for fetching and chunking HTML content.

Attributes:

Name Type Description
fetch_workers int

Number of parallel threads for fetching URLs.

cpu_workers int

Number of parallel processes/threads for cleaning and chunking.

extra_remove_tags List[str]

Additional HTML tags to remove.

strip_attrs bool

Whether to remove all tag attributes.

strip_links bool

Whether to replace &lt;a&gt; tags with text.

keep_tags bool

Whether to preserve HTML tags in the output.

use_clean_rag bool

Whether to use htmlrag for cleaning.

use_clean_chunker bool

Whether the chunker should expect clean HTML.

chunk_size int

Targeted token size for each chunk.

attr_cutoff_len int

Length threshold for attribute retention.

disable_chunking bool

Whether to skip the chunking step.

Source code in src/axetract/preprocessor/axe_preprocessor.py
class AXEPreprocessor(BasePreprocessor):
    """Component for fetching and chunking HTML content.

    Attributes:
        fetch_workers (int): Number of parallel threads for fetching URLs.
        cpu_workers (int): Number of parallel processes/threads for cleaning and chunking.
        extra_remove_tags (List[str]): Additional HTML tags to remove.
        strip_attrs (bool): Whether to remove all tag attributes.
        strip_links (bool): Whether to replace <a> tags with text.
        keep_tags (bool): Whether to preserve HTML tags in the output.
        use_clean_rag (bool): Whether to use htmlrag for cleaning.
        use_clean_chunker (bool): Whether the chunker should expect clean HTML.
        chunk_size (int): Targeted token size for each chunk.
        attr_cutoff_len (int): Length threshold for attribute retention.
        disable_chunking (bool): Whether to skip the chunking step.
    """

    def __init__(
        self,
        name: str = "AXEPreprocessor",
        fetch_workers: int = mp.cpu_count(),
        cpu_workers: int = mp.cpu_count(),
        extra_remove_tags: List[str] | None = None,
        strip_attrs: bool = True,
        strip_links: bool = True,
        keep_tags: bool = True,
        use_clean_rag: bool = True,
        use_clean_chunker: bool = True,
        chunk_size: int = 2000,
        attr_cutoff_len: int = 5,
        disable_chunking: bool = False,
    ):
        """Initialize the preprocessor.

        Args:
            name (str): Component name.
            fetch_workers (int): Fetching thread count.
            cpu_workers (int): Cleaning process count.
            extra_remove_tags (List[str], optional): Tags to strip; defaults to
                ``["header", "footer"]`` when ``None``.
            strip_attrs (bool): Strip attributes flag.
            strip_links (bool): Strip <a> tags flag.
            keep_tags (bool): Keep HTML tags flag.
            use_clean_rag (bool): Use htmlrag flag.
            use_clean_chunker (bool): Clean chunker flag.
            chunk_size (int): Chunk token limit.
            attr_cutoff_len (int): Attribute length limit.
            disable_chunking (bool): Disable chunking flag.
        """
        super().__init__(name)
        self.fetch_workers = fetch_workers
        self.cpu_workers = cpu_workers

        # Default removes common page chrome (headers/footers).
        self.extra_remove_tags = extra_remove_tags if extra_remove_tags is not None else ["header", "footer"]
        self.strip_attrs = strip_attrs
        self.strip_links = strip_links
        self.keep_tags = keep_tags
        self.use_clean_rag = use_clean_rag
        self.use_clean_chunker = use_clean_chunker
        self.chunk_size = chunk_size
        self.attr_cutoff_len = attr_cutoff_len
        self.disable_chunking = disable_chunking

    def __call__(self, samples: list[AXESample] | AXESample) -> List[AXESample]:
        """Fetch, clean, and chunk a batch of samples.

        Args:
            samples (list[AXESample] | AXESample): Input samples (URLs or raw HTML).
                A single sample is accepted and treated as a batch of one.

        Returns:
            list[AXESample]: Samples with chunks populated.
        """
        if isinstance(samples, AXESample):
            samples = [samples]

        n = len(samples)
        if n == 0:
            return []

        def _quick_fetch(sample: AXESample) -> AXESample:
            # Resolve URL samples to raw content in place. On failure the
            # error text is stored as content so downstream stages still
            # receive a sample instead of the whole batch failing.
            if sample.is_content_url:
                try:
                    sample.content = fetch_content(sample.content)
                except Exception as e:
                    sample.content = f"[Fetch ERROR] {e}"
                sample.is_content_url = False
            return sample

        # Fetch content for any URL samples; I/O-bound, so threads suffice.
        with ThreadPoolExecutor(max_workers=min(self.fetch_workers, max(1, n))) as tpool:
            samples = list(tpool.map(_quick_fetch, samples))

        # Cleaning/chunking is CPU-bound: prefer processes when more than one
        # worker is configured, otherwise fall back to a thread pool.
        use_processes = bool(self.cpu_workers and self.cpu_workers > 1)
        executor_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor

        if use_processes:
            max_workers = min(self.cpu_workers or n, max(1, n))
        else:
            # For the thread fallback, use fetch_workers if provided, capped at n.
            max_workers = min(self.fetch_workers or n, max(1, n))

        # Prepare enumerated args (sample, config, index) for the worker.
        items = [(samples[i], self, i) for i in range(n)]

        # Fix: the original initialized this list twice (once untyped, once
        # with a `# type: ignore`); a single, correctly-typed init suffices.
        results: list[Dict | None] = [None] * n

        # Executor.map preserves input order, so idx lines up with samples[idx].
        with executor_cls(max_workers=max_workers) as ex:
            for idx, res in enumerate(ex.map(_chunk_worker, items)):
                results[idx] = res

        for i, res in enumerate(results):
            samples[i].chunks = [
                AXEChunk(chunkid=chunk["chunkid"], content=chunk["chunkcontent"])
                for chunk in res["chunks"]
            ]

        return samples

__call__(samples)

Fetch, clean, and chunk a batch of samples.

Parameters:

Name Type Description Default
samples list[AXESample]

Input samples (URLs or raw HTML).

required

Returns:

Type Description
List[AXESample]

list[AXESample]: Samples with chunks populated.

Source code in src/axetract/preprocessor/axe_preprocessor.py
def __call__(self, samples: list[AXESample] | AXESample) -> List[AXESample]:
    """Fetch, clean, and chunk a batch of samples.

    Args:
        samples (list[AXESample] | AXESample): Input samples (URLs or raw HTML).
            A single sample is accepted and treated as a batch of one.

    Returns:
        list[AXESample]: Samples with chunks populated.
    """
    if isinstance(samples, AXESample):
        samples = [samples]

    n = len(samples)
    if n == 0:
        return []

    def _quick_fetch(sample: AXESample) -> AXESample:
        # Resolve URL samples to raw content in place. On failure the
        # error text is stored as content so downstream stages still
        # receive a sample instead of the whole batch failing.
        if sample.is_content_url:
            try:
                sample.content = fetch_content(sample.content)
            except Exception as e:
                sample.content = f"[Fetch ERROR] {e}"
            sample.is_content_url = False
        return sample

    # Fetch content for any URL samples; I/O-bound, so threads suffice.
    with ThreadPoolExecutor(max_workers=min(self.fetch_workers, max(1, n))) as tpool:
        samples = list(tpool.map(_quick_fetch, samples))

    # Cleaning/chunking is CPU-bound: prefer processes when more than one
    # worker is configured, otherwise fall back to a thread pool.
    use_processes = bool(self.cpu_workers and self.cpu_workers > 1)
    executor_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor

    if use_processes:
        max_workers = min(self.cpu_workers or n, max(1, n))
    else:
        # For the thread fallback, use fetch_workers if provided, capped at n.
        max_workers = min(self.fetch_workers or n, max(1, n))

    # Prepare enumerated args (sample, config, index) for the worker.
    items = [(samples[i], self, i) for i in range(n)]

    # Fix: the original initialized this list twice (once untyped, once
    # with a `# type: ignore`); a single, correctly-typed init suffices.
    results: list[Dict | None] = [None] * n

    # Executor.map preserves input order, so idx lines up with samples[idx].
    with executor_cls(max_workers=max_workers) as ex:
        for idx, res in enumerate(ex.map(_chunk_worker, items)):
            results[idx] = res

    for i, res in enumerate(results):
        samples[i].chunks = [
            AXEChunk(chunkid=chunk["chunkid"], content=chunk["chunkcontent"])
            for chunk in res["chunks"]
        ]

    return samples

__init__(name='AXEPreprocessor', fetch_workers=mp.cpu_count(), cpu_workers=mp.cpu_count(), extra_remove_tags=None, strip_attrs=True, strip_links=True, keep_tags=True, use_clean_rag=True, use_clean_chunker=True, chunk_size=2000, attr_cutoff_len=5, disable_chunking=False)

Initialize the preprocessor.

Parameters:

Name Type Description Default
name str

Component name.

'AXEPreprocessor'
fetch_workers int

Fetching thread count.

cpu_count()
cpu_workers int

Cleaning process count.

cpu_count()
extra_remove_tags List[str]

Tags to strip.

None
strip_attrs bool

Strip attributes flag.

True
strip_links bool

Strip &lt;a&gt; tags flag.

True
keep_tags bool

Keep HTML tags flag.

True
use_clean_rag bool

Use htmlrag flag.

True
use_clean_chunker bool

Clean chunker flag.

True
chunk_size int

Chunk token limit.

2000
attr_cutoff_len int

Attribute length limit.

5
disable_chunking bool

Disable chunking flag.

False
Source code in src/axetract/preprocessor/axe_preprocessor.py
def __init__(
    self,
    name: str = "AXEPreprocessor",
    fetch_workers: int = mp.cpu_count(),
    cpu_workers: int = mp.cpu_count(),
    extra_remove_tags: List[str] | None = None,
    strip_attrs: bool = True,
    strip_links: bool = True,
    keep_tags: bool = True,
    use_clean_rag: bool = True,
    use_clean_chunker: bool = True,
    chunk_size: int = 2000,
    attr_cutoff_len: int = 5,
    disable_chunking: bool = False,
):
    """Configure the preprocessor and register it under *name*.

    Args:
        name (str): Component name.
        fetch_workers (int): Fetching thread count.
        cpu_workers (int): Cleaning process count.
        extra_remove_tags (List[str], optional): Tags to strip; ``None``
            selects the default of ``["header", "footer"]``.
        strip_attrs (bool): Strip attributes flag.
        strip_links (bool): Strip <a> tags flag.
        keep_tags (bool): Keep HTML tags flag.
        use_clean_rag (bool): Use htmlrag flag.
        use_clean_chunker (bool): Clean chunker flag.
        chunk_size (int): Chunk token limit.
        attr_cutoff_len (int): Attribute length limit.
        disable_chunking (bool): Disable chunking flag.
    """
    super().__init__(name)

    # Parallelism settings.
    self.fetch_workers = fetch_workers
    self.cpu_workers = cpu_workers

    # HTML cleaning options; the default removes common page chrome.
    if extra_remove_tags is None:
        extra_remove_tags = ["header", "footer"]
    self.extra_remove_tags = extra_remove_tags
    self.strip_attrs = strip_attrs
    self.strip_links = strip_links
    self.keep_tags = keep_tags
    self.use_clean_rag = use_clean_rag
    self.use_clean_chunker = use_clean_chunker

    # Chunking options.
    self.chunk_size = chunk_size
    self.attr_cutoff_len = attr_cutoff_len
    self.disable_chunking = disable_chunking

axetract.preprocessor.base_preprocessor.BasePreprocessor

Bases: ABC

Abstract base class for all preprocessors.

Source code in src/axetract/preprocessor/base_preprocessor.py
class BasePreprocessor(ABC):
    """Abstract base class that all preprocessors derive from."""

    def __init__(self, name: str):
        """Record the component's name.

        Args:
            name (str): Component name.
        """
        self.name = name

    @abstractmethod
    def __call__(self, samples: List[AXESample]) -> List[AXESample]:
        """Run the preprocessing step over a batch of samples.

        Subclasses must override this method.

        Args:
            samples (List[AXESample]): Input samples.

        Returns:
            List[AXESample]: Processed samples.
        """
        raise NotImplementedError

__call__(samples) abstractmethod

Process a batch of samples.

Parameters:

Name Type Description Default
samples List[AXESample]

Input samples.

required

Returns:

Type Description
List[AXESample]

List[AXESample]: Processed samples.

Source code in src/axetract/preprocessor/base_preprocessor.py
@abstractmethod
def __call__(self, samples: List[AXESample]) -> List[AXESample]:
    """Run the preprocessing step over a batch of samples.

    Subclasses must override this method.

    Args:
        samples (List[AXESample]): Input samples.

    Returns:
        List[AXESample]: Processed samples.
    """
    raise NotImplementedError

__init__(name)

Initialize the preprocessor.

Parameters:

Name Type Description Default
name str

Component name.

required
Source code in src/axetract/preprocessor/base_preprocessor.py
def __init__(self, name: str):
    """Record the component's name.

    Args:
        name (str): Component name.
    """
    self.name = name