-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathapp.py
More file actions
217 lines (187 loc) · 9.1 KB
/
app.py
File metadata and controls
217 lines (187 loc) · 9.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
"""This script demonstrates various methods of text splitting
and how to use Chroma for vectorization of text chunks,
and RAG for question-answering based on the embeddings.
The code is derived from https://mer.vin/2024/03/chunking-strategy/,
with deprecated code commented out and updated.
"""
import logging
from pathlib import Path
from typing import Annotated, Any
from dotenv import load_dotenv
from langchain.docstore.document import Document
# from langchain_community.chat_models import ChatOllama # DEPRECATED
# from langchain_community import embeddings
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import Runnable, RunnablePassthrough, RunnableSerializable
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import CharacterTextSplitter, Language, MarkdownTextSplitter, PythonCodeTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from rich import print as rprint
from agentic_chunker import AgenticChunker
class Texts:
    """Registry of demo texts, addressed by a short two-letter key.

    Values in the registry are either literal strings or zero-argument
    callables; callables defer file I/O until the key is actually
    requested via ``texts[key]``.
    """

    DEFAULT_TEXT_PATH = "content.txt"
    XCARET_TEXT_PATH = "xcaret_content.txt"
    DEFAULT_TEXT_SPLITTING_TEXT_STRING = """Text splitting is the process of dividing a text into smaller parts. It is used to break down a large text into smaller parts to make it easier to read and understand. Text splitting is commonly used in natural language processing to process large amounts of text data."""
    JS_CODE_TEXT_STRING = """
let x = myFunction(4, 3);
function myFunction(a, b) {
return a * b;
}
"""
    MARKDOWN_TEXT = """
# Fun in California
## Driving
Try driving on the 1 down to San Diego
## Food
Make sure to eat a burrito while you're there
## Hiking
Go to Yosemite
"""
    PYTHON_CODE_TEXT_STRING = """
class Person:
def __init__(self, name, age):
self.name = name
self.age = age
p1 = Person("John", 36)
for i in range(10):
print(i)
"""

    def __init__(self) -> None:
        # Lambdas wrap the file reads so a missing file only raises when
        # its key is looked up, not when the registry is constructed.
        self.texts_dict = {
            "ds": Texts.DEFAULT_TEXT_SPLITTING_TEXT_STRING,
            "dt": lambda: Path(self.DEFAULT_TEXT_PATH).read_text(encoding="utf-8"),
            "js": Texts.JS_CODE_TEXT_STRING,
            "md": Texts.MARKDOWN_TEXT,
            "py": Texts.PYTHON_CODE_TEXT_STRING,
            "xc": lambda: Path(self.XCARET_TEXT_PATH).read_text(encoding="utf-8"),
        }

    def __getitem__(self, k: str) -> str:
        """Return the text registered under *k*.

        Unknown keys log a warning and return the fallback string
        "I'm not sure", so the result is always a ``str`` (the previous
        annotation ``str | object | None`` was unnecessarily wide).
        """
        v = self.texts_dict.get(k)
        item = v() if callable(v) else v
        if item is None:
            # Lazy %-formatting: the message is only built if it is emitted.
            logging.warning("No text found for key: %s", k)
            item = "I'm not sure"
        return item
def rag(chunks: list[Document], collection_name: str, local_llm: ChatOllama, question: str) -> str:
    """Answer *question* via retrieval-augmented generation over *chunks*.

    The chunks are embedded with a local Ollama embedding model into a
    Chroma collection named *collection_name*; the retriever feeds the
    matching context into a prompt answered by *local_llm*.
    """
    store = Chroma.from_documents(
        documents=chunks,
        collection_name=collection_name,
        embedding=OllamaEmbeddings(model="nomic-embed-text"),
    )
    template = """Answer the question below, taking into account the context provided here:
{context}
Question: {question}
"""
    # Retriever output is piped into the prompt, the model, then a plain-string parser.
    rag_chain: RunnableSerializable = (
        {"context": store.as_retriever(), "question": RunnablePassthrough()}
        | ChatPromptTemplate.from_template(template)
        | local_llm
        | StrOutputParser()
    )
    return str(rag_chain.invoke(question))
class TextSplitter:
    """Collection of text-splitting strategies.

    Every public method takes raw text and returns a list of LangChain
    ``Document`` objects ready for vectorization.
    """

    def manual_character_split_text(self, text: str, chunk_size: int = 35) -> list[Annotated[Document, "external"]]:
        """Split *text* into fixed-size character chunks by plain slicing."""
        rprint("#### Manual Character Text Splitting ####")
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        return self._chunks_to_documents(chunks)

    def automatic_character_split_text(self, text: str, chunk_size: int = 35) -> list[Annotated[Document, "external"]]:
        """Split on spaces into chunks of roughly *chunk_size* characters."""
        rprint("#### Automatic Character Text Splitting ####")
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, separator=" ", strip_whitespace=False)
        return text_splitter.create_documents([text])

    def recursive_character_split_text(self, text: str, chunk_size: int = 65) -> list[Annotated[Document, "external"]]:
        """Split hierarchically (paragraph -> sentence -> word) to *chunk_size*."""
        rprint("#### Recursive Character Text Splitting ####")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
        return text_splitter.create_documents([text])

    def markdown_split_text(self, text: str, chunk_size: int = 40) -> list[Annotated[Document, "external"]]:
        """Split Markdown along its heading/paragraph structure."""
        rprint("#### Markdown Text Splitting ####")
        text_splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
        return text_splitter.create_documents([text])

    def python_code_split_text(self, text: str, chunk_size: int = 100) -> list[Annotated[Document, "external"]]:
        """Split Python source along class/function boundaries."""
        rprint("#### Python Code Text Splitting ####")
        text_splitter = PythonCodeTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
        return text_splitter.create_documents([text])

    def js_code_split_text(self, text: str, chunk_size: int = 65) -> list[Annotated[Document, "external"]]:
        """Split JavaScript source using language-aware separators."""
        rprint("#### JavaScript Code Text Splitting ####")
        text_splitter = RecursiveCharacterTextSplitter.from_language(
            language=Language.JS, chunk_size=chunk_size, chunk_overlap=0
        )
        return text_splitter.create_documents([text])

    def semantic_split_text(self, text: str) -> list[Annotated[Document, "external"]]:
        """Split at semantic breakpoints detected via OpenAI embeddings.

        Requires OPENAI_API_KEY in the environment (loaded from .env).
        """
        rprint("#### Semantic Text Splitting ####")
        text_splitter = SemanticChunker(
            OpenAIEmbeddings(), breakpoint_threshold_type="percentile"  # "standard_deviation", "interquartile"
        )
        return text_splitter.create_documents([text])

    def agentic_split_text(self, text: str) -> list[Annotated[Document, "external"]]:
        """Split *text* into propositions and let an LLM agent group them."""
        rprint("#### Agentic Text Splitting ####")
        self.text = text
        ac = AgenticChunker()
        ac.add_propositions(self._split_text_into_sentences(self.text))
        chunks = ac.get_chunks(get_type="list_of_strings")
        if not isinstance(chunks, list):
            logging.warning("No chunks found")
            chunks = []
        # Reuse the shared Document factory instead of duplicating it here.
        return self._chunks_to_documents([str(chunk) for chunk in chunks])

    def _split_text_into_sentences(self, text: str) -> list[str]:
        """Naive sentence split on periods, re-appending the period.

        Fix: empty fragments (e.g. the fragment after a trailing period)
        previously produced a bogus "." proposition; they are now skipped.
        """
        return [f"{sentence.strip()}." for sentence in text.split(".") if sentence.strip()]

    def _chunks_to_documents(self, chunks: list[str]) -> list[Annotated[Document, "external"]]:
        """Wrap raw string chunks into Documents tagged with a local source."""
        return [Document(page_content=chunk, metadata={"source": "local"}) for chunk in chunks]
if __name__ == "__main__":
    # GET ENVIRONMENT VARIABLES FROM .env FILE -
    # REQUIRED FOR `semantic_split_text` AND `agentic_split_text`
    load_dotenv()

    # GET TEXT — "dt" reads content.txt lazily; see Texts for other keys.
    texts = Texts()
    text = texts["dt"]  # SELECT TEXT
    if not isinstance(text, str):
        raise ValueError("ERROR: Text is not a string!")

    # SPLIT TEXT — uncomment exactly one strategy.
    text_splitter = TextSplitter()
    # documents = text_splitter.manual_character_split_text(text)
    # documents = text_splitter.automatic_character_split_text(text)
    # documents = text_splitter.recursive_character_split_text(text)
    # documents = text_splitter.markdown_split_text(text)
    # documents = text_splitter.python_code_split_text(text)
    # documents = text_splitter.js_code_split_text(text)
    # documents = text_splitter.semantic_split_text(text)
    documents = text_splitter.agentic_split_text(text)
    rprint(documents)

    # PERFORM RAG with a local Ollama chat model (deterministic: temperature 0).
    local_llm = ChatOllama(model="mistral", temperature=0.0, num_predict=256)
    QUESTION = "What is the use of Text Splitting?"
    # QUESTION = "What are the attributes of Person class?"
    # QUESTION = "Is myFunction valid python?"
    # QUESTION = "Where should I go for hiking in California?"

    # READ USER INPUT LOOP
    # NOTE: Conversation is single-turn
    try:
        while True:
            # Plain string prompt (the former f-string had no placeholders).
            question = input("Enter your question (Ctrl-D to exit) (leave blank for default question): ")
            if not question:
                question = QUESTION
            rprint(f"Question: {question}")
            rag_response = rag(documents, "text_splitting", local_llm, question)
            rprint(rag_response)
    except EOFError:
        rprint("\nExiting...")