import datetime as dt
from dataclasses import asdict, field
from typing import Any, Literal
from pydantic import model_serializer
# We use pydantic.dataclasses to get type validation.
# See the docstring of `csi` module for more information on the why.
from pydantic.dataclasses import dataclass
[docs]
@dataclass(frozen=True)
class DocumentPath:
    """Address of a single document.

    A document is located by three parts: a namespace, a collection inside
    that namespace, and the document's name inside that collection.

    The dataclass is frozen so that instances are hashable. A caller can
    therefore collect paths in a `set` to drop duplicates before requesting
    the documents.

    Attributes:
        namespace (str): The namespace.
        collection (str): The collection within the namespace.
        name (str): The name identifying the document in the collection.
    """

    namespace: str
    collection: str
    name: str
[docs]
@dataclass
class IndexPath:
    """Selects the documents to search and the index to search them with.

    Attributes:
        namespace (str): The namespace the collection belongs to.
        collection (str): The collection you want to search in.
        index (str): The search index you want to use for the collection.
    """

    namespace: str
    collection: str
    index: str
[docs]
@dataclass
class GreaterThan:
    """Numeric filter condition tagged `greater_than`.

    Attributes:
        value (float): The number the condition is parameterised with.
    """

    # Tag for this condition; presumably the JSON key used when the condition
    # is serialized (the consumer is not part of this section).
    __json_name__ = "greater_than"

    value: float
[docs]
@dataclass
class GreaterThanOrEqualTo:
    """Numeric filter condition tagged `greater_than_or_equal_to`.

    Attributes:
        value (float): The number the condition is parameterised with.
    """

    # Tag for this condition; presumably the JSON key used when the condition
    # is serialized (the consumer is not part of this section).
    __json_name__ = "greater_than_or_equal_to"

    value: float
[docs]
@dataclass
class LessThan:
    """Numeric filter condition tagged `less_than`.

    Attributes:
        value (float): The number the condition is parameterised with.
    """

    # Tag for this condition; presumably the JSON key used when the condition
    # is serialized (the consumer is not part of this section).
    __json_name__ = "less_than"

    value: float
[docs]
@dataclass
class LessThanOrEqualTo:
    """Numeric filter condition tagged `less_than_or_equal_to`.

    Attributes:
        value (float): The number the condition is parameterised with.
    """

    # Tag for this condition; presumably the JSON key used when the condition
    # is serialized (the consumer is not part of this section).
    __json_name__ = "less_than_or_equal_to"

    value: float
[docs]
@dataclass
class After:
    """Datetime filter condition tagged `after`.

    Attributes:
        value (dt.datetime): The reference point in time. Must be timezone-aware.

    Raises:
        ValueError: If `value` is a naive datetime.
            NOTE(review): when constructed through pydantic's dataclass
            machinery this is expected to surface as a `ValidationError`
            (itself a `ValueError`) - confirm for the pydantic version in use.
    """

    __json_name__ = "after"

    value: dt.datetime

    def __post_init__(self) -> None:
        """Reject naive datetimes."""
        # Explicit raise rather than `assert`: assertions are removed under
        # `python -O`, which would silently disable this validation.
        # `utcoffset()` is None both when `tzinfo` is unset and when the
        # tzinfo object yields no offset, i.e. exactly for naive datetimes.
        if self.value.utcoffset() is None:
            raise ValueError("Datetimes must be timezone-aware")
[docs]
@dataclass
class AtOrAfter:
    """Datetime filter condition tagged `at_or_after`.

    Attributes:
        value (dt.datetime): The reference point in time. Must be timezone-aware.

    Raises:
        ValueError: If `value` is a naive datetime.
            NOTE(review): when constructed through pydantic's dataclass
            machinery this is expected to surface as a `ValidationError`
            (itself a `ValueError`) - confirm for the pydantic version in use.
    """

    __json_name__ = "at_or_after"

    value: dt.datetime

    def __post_init__(self) -> None:
        """Reject naive datetimes."""
        # Explicit raise rather than `assert`: assertions are removed under
        # `python -O`, which would silently disable this validation.
        # `utcoffset()` is None both when `tzinfo` is unset and when the
        # tzinfo object yields no offset, i.e. exactly for naive datetimes.
        if self.value.utcoffset() is None:
            raise ValueError("Datetimes must be timezone-aware")
[docs]
@dataclass
class Before:
    """Datetime filter condition tagged `before`.

    Attributes:
        value (dt.datetime): The reference point in time. Must be timezone-aware.

    Raises:
        ValueError: If `value` is a naive datetime.
            NOTE(review): when constructed through pydantic's dataclass
            machinery this is expected to surface as a `ValidationError`
            (itself a `ValueError`) - confirm for the pydantic version in use.
    """

    __json_name__ = "before"

    value: dt.datetime

    def __post_init__(self) -> None:
        """Reject naive datetimes."""
        # Explicit raise rather than `assert`: assertions are removed under
        # `python -O`, which would silently disable this validation.
        # `utcoffset()` is None both when `tzinfo` is unset and when the
        # tzinfo object yields no offset, i.e. exactly for naive datetimes.
        if self.value.utcoffset() is None:
            raise ValueError("Datetimes must be timezone-aware")
[docs]
@dataclass
class AtOrBefore:
    """Datetime filter condition tagged `at_or_before`.

    Attributes:
        value (dt.datetime): The reference point in time. Must be timezone-aware.

    Raises:
        ValueError: If `value` is a naive datetime.
            NOTE(review): when constructed through pydantic's dataclass
            machinery this is expected to surface as a `ValidationError`
            (itself a `ValueError`) - confirm for the pydantic version in use.
    """

    __json_name__ = "at_or_before"

    value: dt.datetime

    def __post_init__(self) -> None:
        """Reject naive datetimes."""
        # Explicit raise rather than `assert`: assertions are removed under
        # `python -O`, which would silently disable this validation.
        # `utcoffset()` is None both when `tzinfo` is unset and when the
        # tzinfo object yields no offset, i.e. exactly for naive datetimes.
        if self.value.utcoffset() is None:
            raise ValueError("Datetimes must be timezone-aware")
[docs]
@dataclass
class EqualTo:
__json_name__ = "equal_to"
value: str | int | bool
[docs]
@dataclass
class IsNull:
    """This condition matches all metadata fields with a value of null.

    Attributes:
        value (Literal[True]): Fixed to `True`; it is the only value the
            annotation admits and also the default.
    """

    __json_name__ = "is_null"

    value: Literal[True] = True
# Union of the single-value condition classes defined above: the float
# comparisons, the datetime comparisons, the equality check and the null check.
FilterCondition = (
GreaterThan
| GreaterThanOrEqualTo
| LessThan
| LessThanOrEqualTo
| After
| AtOrAfter
| Before
| AtOrBefore
| EqualTo
| IsNull
)
[docs]
@dataclass
class Without:
    """Logical conjunction of negations, i.e. forms the predicate "(NOT filterCondition1) AND (NOT filterCondition2) AND ..."

    Attributes:
        value (list[MetadataFilter]): The list of filter conditions.
    """

    value: list[MetadataFilter]

    def serialize(self) -> dict[str, list[Any]]:
        """Return a one-key dict mapping "without" to every filter's serialized form, in order."""
        serialized = [metadata_filter.serialize() for metadata_filter in self.value]
        return {"without": serialized}
[docs]
@dataclass
class WithOneOf:
    """Logical disjunction, i.e. forms the predicate "filterCondition1 OR filterCondition2 OR ..."

    Attributes:
        value (list[MetadataFilter]): The list of filter conditions.
    """

    value: list[MetadataFilter]

    def serialize(self) -> dict[str, list[Any]]:
        """Return a one-key dict mapping "with_one_of" to every filter's serialized form, in order."""
        serialized = [metadata_filter.serialize() for metadata_filter in self.value]
        return {"with_one_of": serialized}
[docs]
@dataclass
class With:
    """Logical conjunction, i.e. forms the predicate "filterCondition1 AND filterCondition2 AND ..."

    Attributes:
        value (list[MetadataFilter]): The list of filter conditions.
    """

    value: list[MetadataFilter]

    def serialize(self) -> dict[str, list[Any]]:
        """Return a one-key dict mapping "with" to every filter's serialized form, in order."""
        serialized = [metadata_filter.serialize() for metadata_filter in self.value]
        return {"with": serialized}
# Any of the three combinator classes; each one exposes a `serialize()` method
# that returns a single-key dict ("without", "with" or "with_one_of").
SearchFilter = Without | With | WithOneOf
"""A logical combination of filter conditions."""
[docs]
@dataclass
class SearchRequest:
    """A request to search the document index.

    Attributes:
        index_path (IndexPath): The index path to search in.
        query (str): The query to search for.
        max_results (int): Maximum number of results to return. Defaults to 1.
        min_score (float | None): Filter out results with a cosine similarity score below this value.
            Scores range from -1 to 1. For searches on hybrid indexes, the Document Index applies the min_score to the semantic results before fusion of result sets.
            As fusion re-scores results, returned scores may exceed this value.
        filters (list[SearchFilter]): A filter for search results that restricts the results to those document sections that match the filter criteria.
            The individual conditions of this array are AND-combined (i.e. all conditions must match).
            This can for example be used to restrict the returned sections based on their modality (i.e. image or text), or on their metadata.
    """

    index_path: IndexPath
    query: str
    max_results: int = 1
    min_score: float | None = None
    filters: list[SearchFilter] = field(default_factory=list)

    @model_serializer()
    def serialize(self) -> dict[str, Any]:
        """Build the plain-dict form of the request.

        The index path is flattened with `asdict`, each filter contributes
        the result of its own `serialize()`, and the remaining fields are
        copied through unchanged.
        """
        index_path = asdict(self.index_path)
        serialized_filters = [search_filter.serialize() for search_filter in self.filters]
        payload: dict[str, Any] = {
            "index_path": index_path,
            "query": self.query,
            "max_results": self.max_results,
            "min_score": self.min_score,
            "filters": serialized_filters,
        }
        return payload
[docs]
@dataclass
class Cursor:
    """A position within a document.

    Whether used as a start or as an end position, the cursor includes the
    position it points at.

    Attributes:
        item (int): Index of the item in the document. A document is an array of text and image elements. These elements are referred to as items.
        position (int): The character position the cursor can be found at within the string.
    """

    item: int
    position: int
[docs]
@dataclass
class SearchResult:
    """The relevant documents as result of a search request.

    Attributes:
        document_path (DocumentPath): The path to a document. A path uniquely identifies a document among all managed documents.
        content (str): The text of the found section. As we do not support multi-modal, this is always a string.
        score (float): Search score of the found section, where a higher score indicates a closer match.
            Will be between -1 and 1. A score closer to -1 indicates the section opposes the query.
            A score close to 0 suggests the section is unrelated to the query.
            A score close to 1 suggests the section is related to the query.
        start (Cursor): Where the result starts in the document.
        end (Cursor): Where the result ends in the document.
    """

    document_path: DocumentPath
    content: str
    score: float
    start: Cursor
    end: Cursor
[docs]
@dataclass
class Text:
    """A text section that is part of a document.

    A document that contains nothing but text has contents of length one,
    and that single element is a `Text`.

    Attributes:
        text (str): The text of the section.
        modality (Literal["text"]): Fixed tag "text"; also the default.
    """

    text: str
    modality: Literal["text"] = "text"
[docs]
@dataclass
class Image:
    """An image that is part of a document.

    The image contents are not exposed at the moment, because none of the
    models support multi-modal inputs. The class still exists so that the
    developer is informed that the document contains an image.

    Attributes:
        modality (Literal["image"]): Fixed tag "image"; also the default.
    """

    modality: Literal["image"] = "image"
# Either member of the union carries a `modality` literal field
# ("text" for `Text`, "image" for `Image`).
Modality = Text | Image
"""A document is made up of subsections of different modalities.
For example, if a document is a long text with an image in the middle, then
it will be represented by a list of length three, where the first and last
elements are `Text` and the middle element is `Image`.
"""
# Also the annotated type of `Document.metadata` in this module.
JsonSerializable = dict[str, Any] | list[Any] | str | int | float | bool | None
"""Represent any value that can be serialized/deserialized to/from JSON.
Used to represent the return type of `document_metadata` which is any valid JSON value.
"""
[docs]
@dataclass
class Document:
    """A document in the Document Index.

    Attributes:
        path (DocumentPath): The path that identifies the document.
        contents (list[Modality]): The contents of the document. Split into sections of different modalities.
        metadata (JsonSerializable): The (custom) metadata of the document.
    """

    path: DocumentPath
    contents: list[Modality]
    metadata: JsonSerializable

    @property
    def text(self) -> str:
        """Return all `Text` sections joined by a blank line; other sections are skipped."""
        text_sections = [
            section.text for section in self.contents if isinstance(section, Text)
        ]
        return "\n\n".join(text_sections)