-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathapp.py
More file actions
217 lines (187 loc) · 9.1 KB
/
app.py
File metadata and controls
217 lines (187 loc) · 9.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
"""This script demonstrates various methods of text splitting
and how to use Chroma for vectorization of text chunks,
and RAG for question-answering based on the embeddings.
The code is derived from https://mer.vin/2024/03/chunking-strategy/,
with deprecated code commented out and updated.
"""
import logging
from pathlib import Path
from typing import Annotated, Any
from dotenv import load_dotenv
from langchain.docstore.document import Document
# from langchain_community.chat_models import ChatOllama # DEPRECATED
# from langchain_community import embeddings
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import Runnable, RunnablePassthrough, RunnableSerializable
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import CharacterTextSplitter, Language, MarkdownTextSplitter, PythonCodeTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from rich import print as rprint
from agentic_chunker import AgenticChunker
class Texts:
    """Registry of demo texts, addressed by a short two-letter key.

    Values in the registry are either literal strings or zero-argument
    callables; callables defer file I/O until the key is actually
    requested via ``texts[key]``.
    """

    DEFAULT_TEXT_PATH = "content.txt"
    XCARET_TEXT_PATH = "xcaret_content.txt"
    DEFAULT_TEXT_SPLITTING_TEXT_STRING = """Text splitting is the process of dividing a text into smaller parts. It is used to break down a large text into smaller parts to make it easier to read and understand. Text splitting is commonly used in natural language processing to process large amounts of text data."""
    JS_CODE_TEXT_STRING = """
let x = myFunction(4, 3);
function myFunction(a, b) {
return a * b;
}
"""
    MARKDOWN_TEXT = """
# Fun in California
## Driving
Try driving on the 1 down to San Diego
## Food
Make sure to eat a burrito while you're there
## Hiking
Go to Yosemite
"""
    PYTHON_CODE_TEXT_STRING = """
class Person:
def __init__(self, name, age):
self.name = name
self.age = age
p1 = Person("John", 36)
for i in range(10):
print(i)
"""

    def __init__(self) -> None:
        # Lambdas wrap the file reads so a missing file only raises when
        # its key is looked up, not when the registry is constructed.
        self.texts_dict = {
            "ds": Texts.DEFAULT_TEXT_SPLITTING_TEXT_STRING,
            "dt": lambda: Path(self.DEFAULT_TEXT_PATH).read_text(encoding="utf-8"),
            "js": Texts.JS_CODE_TEXT_STRING,
            "md": Texts.MARKDOWN_TEXT,
            "py": Texts.PYTHON_CODE_TEXT_STRING,
            "xc": lambda: Path(self.XCARET_TEXT_PATH).read_text(encoding="utf-8"),
        }

    def __getitem__(self, k: str) -> str:
        """Return the text registered under *k*.

        Unknown keys log a warning and return the fallback string
        "I'm not sure", so the result is always a ``str`` (the previous
        annotation ``str | object | None`` was unnecessarily wide).
        """
        v = self.texts_dict.get(k)
        item = v() if callable(v) else v
        if item is None:
            # Lazy %-formatting: the message is only built if it is emitted.
            logging.warning("No text found for key: %s", k)
            item = "I'm not sure"
        return item
def rag(chunks: list[Document], collection_name: str, local_llm: ChatOllama, question: str) -> str:
    """Answer *question* via retrieval-augmented generation over *chunks*.

    The chunks are embedded with a local Ollama embedding model into a
    Chroma collection named *collection_name*; the retriever feeds the
    matching context into a prompt answered by *local_llm*.
    """
    store = Chroma.from_documents(
        documents=chunks,
        collection_name=collection_name,
        embedding=OllamaEmbeddings(model="nomic-embed-text"),
    )
    template = """Answer the question below, taking into account the context provided here:
{context}
Question: {question}
"""
    # Retriever output is piped into the prompt, the model, then a plain-string parser.
    rag_chain: RunnableSerializable = (
        {"context": store.as_retriever(), "question": RunnablePassthrough()}
        | ChatPromptTemplate.from_template(template)
        | local_llm
        | StrOutputParser()
    )
    return str(rag_chain.invoke(question))
class TextSplitter:
    """Collection of text-splitting strategies.

    Every public method takes raw text and returns a list of LangChain
    ``Document`` objects ready for vectorization.
    """

    def manual_character_split_text(self, text: str, chunk_size: int = 35) -> list[Annotated[Document, "external"]]:
        """Split *text* into fixed-size character chunks by plain slicing."""
        rprint("#### Manual Character Text Splitting ####")
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        return self._chunks_to_documents(chunks)

    def automatic_character_split_text(self, text: str, chunk_size: int = 35) -> list[Annotated[Document, "external"]]:
        """Split on spaces into chunks of roughly *chunk_size* characters."""
        rprint("#### Automatic Character Text Splitting ####")
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, separator=" ", strip_whitespace=False)
        return text_splitter.create_documents([text])

    def recursive_character_split_text(self, text: str, chunk_size: int = 65) -> list[Annotated[Document, "external"]]:
        """Split hierarchically (paragraph -> sentence -> word) to *chunk_size*."""
        rprint("#### Recursive Character Text Splitting ####")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
        return text_splitter.create_documents([text])

    def markdown_split_text(self, text: str, chunk_size: int = 40) -> list[Annotated[Document, "external"]]:
        """Split Markdown along its heading/paragraph structure."""
        rprint("#### Markdown Text Splitting ####")
        text_splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
        return text_splitter.create_documents([text])

    def python_code_split_text(self, text: str, chunk_size: int = 100) -> list[Annotated[Document, "external"]]:
        """Split Python source along class/function boundaries."""
        rprint("#### Python Code Text Splitting ####")
        text_splitter = PythonCodeTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
        return text_splitter.create_documents([text])

    def js_code_split_text(self, text: str, chunk_size: int = 65) -> list[Annotated[Document, "external"]]:
        """Split JavaScript source using language-aware separators."""
        rprint("#### JavaScript Code Text Splitting ####")
        text_splitter = RecursiveCharacterTextSplitter.from_language(
            language=Language.JS, chunk_size=chunk_size, chunk_overlap=0
        )
        return text_splitter.create_documents([text])

    def semantic_split_text(self, text: str) -> list[Annotated[Document, "external"]]:
        """Split at semantic breakpoints detected via OpenAI embeddings.

        Requires OPENAI_API_KEY in the environment (loaded from .env).
        """
        rprint("#### Semantic Text Splitting ####")
        text_splitter = SemanticChunker(
            OpenAIEmbeddings(), breakpoint_threshold_type="percentile"  # "standard_deviation", "interquartile"
        )
        return text_splitter.create_documents([text])

    def agentic_split_text(self, text: str) -> list[Annotated[Document, "external"]]:
        """Split *text* into propositions and let an LLM agent group them."""
        rprint("#### Agentic Text Splitting ####")
        self.text = text
        ac = AgenticChunker()
        ac.add_propositions(self._split_text_into_sentences(self.text))
        chunks = ac.get_chunks(get_type="list_of_strings")
        if not isinstance(chunks, list):
            logging.warning("No chunks found")
            chunks = []
        # Reuse the shared Document factory instead of duplicating it here.
        return self._chunks_to_documents([str(chunk) for chunk in chunks])

    def _split_text_into_sentences(self, text: str) -> list[str]:
        """Naive sentence split on periods, re-appending the period.

        Fix: empty fragments (e.g. the fragment after a trailing period)
        previously produced a bogus "." proposition; they are now skipped.
        """
        return [f"{sentence.strip()}." for sentence in text.split(".") if sentence.strip()]

    def _chunks_to_documents(self, chunks: list[str]) -> list[Annotated[Document, "external"]]:
        """Wrap raw string chunks into Documents tagged with a local source."""
        return [Document(page_content=chunk, metadata={"source": "local"}) for chunk in chunks]
if __name__ == "__main__":
    # GET ENVIRONMENT VARIABLES FROM .env FILE -
    # REQUIRED FOR `semantic_split_text` AND `agentic_split_text`
    load_dotenv()

    # GET TEXT — "dt" reads content.txt lazily; see Texts for other keys.
    texts = Texts()
    text = texts["dt"]  # SELECT TEXT
    if not isinstance(text, str):
        raise ValueError("ERROR: Text is not a string!")

    # SPLIT TEXT — uncomment exactly one strategy.
    text_splitter = TextSplitter()
    # documents = text_splitter.manual_character_split_text(text)
    # documents = text_splitter.automatic_character_split_text(text)
    # documents = text_splitter.recursive_character_split_text(text)
    # documents = text_splitter.markdown_split_text(text)
    # documents = text_splitter.python_code_split_text(text)
    # documents = text_splitter.js_code_split_text(text)
    # documents = text_splitter.semantic_split_text(text)
    documents = text_splitter.agentic_split_text(text)
    rprint(documents)

    # PERFORM RAG with a local Ollama chat model (deterministic: temperature 0).
    local_llm = ChatOllama(model="mistral", temperature=0.0, num_predict=256)
    QUESTION = "What is the use of Text Splitting?"
    # QUESTION = "What are the attributes of Person class?"
    # QUESTION = "Is myFunction valid python?"
    # QUESTION = "Where should I go for hiking in California?"

    # READ USER INPUT LOOP
    # NOTE: Conversation is single-turn
    try:
        while True:
            # Plain string prompt (the former f-string had no placeholders).
            question = input("Enter your question (Ctrl-D to exit) (leave blank for default question): ")
            if not question:
                question = QUESTION
            rprint(f"Question: {question}")
            rag_response = rag(documents, "text_splitting", local_llm, question)
            rprint(rag_response)
    except EOFError:
        rprint("\nExiting...")