Source code for pharia_skill.csi.document_index

import datetime as dt
from dataclasses import asdict, field
from typing import Any, Literal

from pydantic import model_serializer

# We use pydantic.dataclasses to get type validation.
# See the docstring of `csi` module for more information on the why.
from pydantic.dataclasses import dataclass


[docs] @dataclass(frozen=True) class DocumentPath: """Path identifying a document. A DocumentPath consists of a namespace, within the namespace a collection and within the collection a document has a name. A user might want to filter for unique documents. By making `DocumentPath` a frozen dataclass, we ensure that it is hashable and a user can use a set to filter for unique ones before requesting the documents. Attributes: namespace (str): The namespace. collection (str): The collection within the namespace. name (str): The name identifying the document in the collection. """ namespace: str collection: str name: str
[docs] @dataclass class IndexPath: """Which documents you want to search in, and which type of index should be used. Attributes: namespace (string): The namespace the collection belongs to. collection (string): The collection you want to search in. index (str): The search index you want to use for the collection. """ namespace: str collection: str index: str
[docs] @dataclass class GreaterThan: __json_name__ = "greater_than" value: float
[docs] @dataclass class GreaterThanOrEqualTo: __json_name__ = "greater_than_or_equal_to" value: float
[docs] @dataclass class LessThan: __json_name__ = "less_than" value: float
[docs] @dataclass class LessThanOrEqualTo: __json_name__ = "less_than_or_equal_to" value: float
[docs] @dataclass class After: __json_name__ = "after" value: dt.datetime def __post_init__(self) -> None: assert self.value.tzinfo is not None, "Datetimes must be timezone-aware"
[docs] @dataclass class AtOrAfter: __json_name__ = "at_or_after" value: dt.datetime def __post_init__(self) -> None: assert self.value.tzinfo is not None, "Datetimes must be timezone-aware"
[docs] @dataclass class Before: __json_name__ = "before" value: dt.datetime def __post_init__(self) -> None: assert self.value.tzinfo is not None, "Datetimes must be timezone-aware"
[docs] @dataclass class AtOrBefore: __json_name__ = "at_or_before" value: dt.datetime def __post_init__(self) -> None: assert self.value.tzinfo is not None, "Datetimes must be timezone-aware"
[docs] @dataclass class EqualTo: __json_name__ = "equal_to" value: str | int | bool
[docs] @dataclass class IsNull: __json_name__ = "is_null" value: Literal[True] = True
"""This condition matches all metadata fields with a value of null.""" FilterCondition = ( GreaterThan | GreaterThanOrEqualTo | LessThan | LessThanOrEqualTo | After | AtOrAfter | Before | AtOrBefore | EqualTo | IsNull )
[docs] @dataclass class MetadataFilter: """Matches sections whose metadata fields match the given condition. You must specify the field, and can only specify a single condition. While the Document Index also offers a `Modality` filter, we do not expose this to the developer. The reasoning is that we only allow for text modalities in the Kernel. So for each search request, we append a `Modality` filter that only allows for text modalities. Attributes: field (str): The metadata field on which to filter search results. Field names must only contain alphanumeric characters, dashes and underscores. Nested fields can be specified using dot notation (e.g. 'a.b'). Array-valued fields can either use a wildcard specifier (e.g. 'a[].b') or a specific index (e.g. 'a[1].b'). The maximum length of the field name is 1000 characters. condition (FilterCondition): The condition to filter on. """ field: str condition: FilterCondition
[docs] def serialize(self) -> dict[str, Any]: """How to serialize a metadata filter to a dictionary. It would be nice to specify this as a `model_serializer` and let pydantic handle the serialization. However, as we are already doing custom serialization on the outside, and this is not a Pydantic model we could call `.model_dump()` on, it seems to be the simplest solution to just implement the serialization manually. """ return { "metadata": { "field": self.field, self.condition.__json_name__: self.condition.value, } }
[docs] @dataclass class Without: """Logical conjunction of negations, i.e. forms the predicate "(NOT filterCondition1) AND (NOT filterCondition2) AND ..." Attributes: value (list[Filter]): The list of filter conditions. """ value: list[MetadataFilter]
[docs] def serialize(self) -> dict[str, list[Any]]: return {"without": [filter.serialize() for filter in self.value]}
[docs] @dataclass class WithOneOf: """Logical disjunction, i.e. forms the predicate "filterCondition1 OR filterCondition2 OR ..." Attributes: value (list[Filter]): The list of filter conditions. """ value: list[MetadataFilter]
[docs] def serialize(self) -> dict[str, list[Any]]: return {"with_one_of": [filter.serialize() for filter in self.value]}
[docs] @dataclass class With: """Logical conjunction, i.e. forms the predicate "filterCondition1 AND filterCondition2 AND ..." Attributes: value (list[Filter]): The list of filter conditions. """ value: list[MetadataFilter]
[docs] def serialize(self) -> dict[str, list[Any]]: return {"with": [filter.serialize() for filter in self.value]}
SearchFilter = Without | With | WithOneOf """A logical combination of filter conditions."""
[docs] @dataclass class SearchRequest: """A request to search the document index. Attributes: index_path (IndexPath): The index path to search in. query (str): The query to search for. max_results (int): Maximum number of results to return. Defaults to 1. min_score (float | None): Filter out results with a cosine similarity score below this value. Scores range from -1 to 1. For searches on hybrid indexes, the Document Index applies the min_score to the semantic results before fusion of result sets. As fusion re-scores results, returned scores may exceed this value. filters (list[SearchFilter]): A filter for search results that restricts the results to those document sections that match the filter criteria. The individual conditions of this array are AND-combined (i.e. all conditions must match). This can for example be used to restrict the returned sections based on their modality (i.e. image or text), or on their metadata. """ index_path: IndexPath query: str max_results: int = 1 min_score: float | None = None filters: list[SearchFilter] = field(default_factory=list)
[docs] @model_serializer() def serialize(self) -> dict[str, Any]: return { "index_path": asdict(self.index_path), "query": self.query, "max_results": self.max_results, "min_score": self.min_score, "filters": [filter.serialize() for filter in self.filters], }
[docs] @dataclass class Cursor: """A position within a document. The cursor is always inclusive of the current position, in both start and end positions. Attributes: item (int): Index of the item in the document. A document is an array of text and image elements. These elements are referred to as items. position (int): The character position the cursor can be found at within the string. """ item: int position: int
[docs] @dataclass class SearchResult: """The relevant documents as result of a search request. Attributes: document_path (DocumentPath): The path to a document. A path uniquely identifies a document among all managed documents. content (str): The text of the found section. As we do not support multi-modal, this is always a string. score (float): Search score of the found section, where a higher score indicates a closer match. Will be between -1 and 1. A score closer to -1 indicates the section opposes the query. A score close 0 suggests the section is unrelated to the query. A score close to 1 suggests the section is related to the query. start (Cursor): Where the result starts in the document. end (Cursor): Where the result ends in the document. """ document_path: DocumentPath content: str score: float start: Cursor end: Cursor
[docs] @dataclass class Text: """A text section that is part of a document. If the document only contains text, then the contents of the document is a list of length one, where the only element is a `Text`.""" text: str modality: Literal["text"] = "text"
[docs] @dataclass class Image: """An image that is part of a document. At the moment, we do not expose the image contents, as none of the models support multi-modal inputs. We still inform the developer that the document contains an image. """ modality: Literal["image"] = "image"
Modality = Text | Image """A document is made up of subsections of different modalities. For example, if a document is a long text with an image in the middle, then it will be represented by a list of length three, where the first and last elements are `Text` and the middle element is `Image`. """ JsonSerializable = dict[str, Any] | list[Any] | str | int | float | bool | None """Represent any value that can be serialized/deserialized to/from JSON. Used to represent the return type of `document_metadata` which is any valid JSON value. """
[docs] @dataclass class Document: """A document in the Document Index. Attributes: path (DocumentPath): The path that identifies the document. contents (list[Modality]): The contents of the document. Split into sections of different modalities. metadata (JsonSerializable): The (custom) metadata of the document. """ path: DocumentPath contents: list[Modality] metadata: JsonSerializable @property def text(self) -> str: """Concatenate the text contents of the document.""" return "\n\n".join( text.text for text in self.contents if isinstance(text, Text) )