Source code for pharia_skill.csi.chunking

from dataclasses import asdict
from typing import Any

from pydantic import model_serializer

# We use pydantic.dataclasses to get type validation.
# See the docstring of the `csi` module for the reasoning behind this choice.
from pydantic.dataclasses import dataclass



[docs]
@dataclass
class ChunkParams:
    """Settings that control how a text is split into chunks.

    Attributes:
        model (str, required): Name of the model the chunks are intended for. This must be a known model.
        max_tokens (int, required): Upper bound on the number of tokens returned per chunk.
        overlap (int, optional, default 0): Amount of overlap allowed between chunks. Must be less than max_tokens; that constraint is not checked in this class. The default of 0 means chunks do not overlap.
    """

    model: str
    max_tokens: int
    overlap: int = 0




[docs]
@dataclass
class ChunkRequest:
    """Payload describing a single chunking call.

    Attributes:
        text (str, required): The text to be chunked.
        params (ChunkParams, required): Parameters used for chunking.
    """

    text: str
    params: ChunkParams

    @model_serializer()
    def serialize(self) -> dict[str, Any]:
        """Return the dictionary used as this request's serialized form.

        The result holds the text, the chunking parameters converted to a
        plain dictionary, and a ``"character_offsets"`` entry that is always
        ``True`` (presumably so that returned chunks carry the offset exposed
        on ``Chunk`` -- confirm against the consumer of this payload).
        """
        raw_text = self.text
        params_as_dict = asdict(self.params)
        return dict(
            text=raw_text,
            params=params_as_dict,
            character_offsets=True,
        )





[docs]
@dataclass
class Chunk:
    """A single piece of chunked text together with its position.

    Attributes:
        text (str, required): The text of this chunk.
        character_offset (int, required): The character offset of this chunk relative to the start of the original text.
    """

    text: str
    character_offset: int