Skip to content

Commit 972e79f

Browse files
authored
Meeting Transcription Notebook (#439)
A new sample notebook requested internally for meeting transcription. Super simple, uses Hugging Face, Whisper and pinecone's integrated inference for semantic search. --- - To see the specific tasks where the Asana app for GitHub is being used, see below: - https://app.asana.com/0/0/1210038526369577
1 parent fa35380 commit 972e79f

File tree

1 file changed

+284
-0
lines changed

1 file changed

+284
-0
lines changed
Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"vscode": {
7+
"languageId": "bat"
8+
}
9+
},
10+
"source": [
11+
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/search/meeting-transcription-search/meeting_transcription_semantic_search.ipynb)"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"## Semantic Search over Your Meeting Audio Data\n",
19+
"\n",
20+
"This notebook demonstrates how to quickly enable semantic search over a single audio file using Pinecone and Hugging Face. Don't have one handy? No problem, use\n",
21+
"the sample audio instead."
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 45,
27+
"metadata": {},
28+
"outputs": [
29+
{
30+
"name": "stdout",
31+
"output_type": "stream",
32+
"text": [
33+
"Requirement already satisfied: datasets in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (3.1.0)\n",
34+
"Requirement already satisfied: transformers in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (4.37.2)\n",
35+
"Requirement already satisfied: pinecone in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (6.0.2)\n",
36+
"Requirement already satisfied: filelock in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (3.13.1)\n",
37+
"Requirement already satisfied: numpy>=1.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (1.26.4)\n",
38+
"Requirement already satisfied: pyarrow>=15.0.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (18.0.0)\n",
39+
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (0.3.8)\n",
40+
"Requirement already satisfied: pandas in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (2.2.2)\n",
41+
"Requirement already satisfied: requests>=2.32.2 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (2.32.3)\n",
42+
"Requirement already satisfied: tqdm>=4.66.3 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (4.67.1)\n",
43+
"Requirement already satisfied: xxhash in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (3.5.0)\n",
44+
"Requirement already satisfied: multiprocess<0.70.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (0.70.16)\n",
45+
"Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.3.1)\n",
46+
"Requirement already satisfied: aiohttp in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (3.11.7)\n",
47+
"Requirement already satisfied: huggingface-hub>=0.23.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (0.26.2)\n",
48+
"Requirement already satisfied: packaging in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (23.2)\n",
49+
"Requirement already satisfied: pyyaml>=5.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from datasets) (6.0.1)\n",
50+
"Requirement already satisfied: regex!=2019.12.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from transformers) (2023.10.3)\n",
51+
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from transformers) (0.15.1)\n",
52+
"Requirement already satisfied: safetensors>=0.4.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from transformers) (0.4.2)\n",
53+
"Requirement already satisfied: certifi>=2019.11.17 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (2024.2.2)\n",
54+
"Requirement already satisfied: pinecone-plugin-interface<0.0.8,>=0.0.7 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (0.0.7)\n",
55+
"Requirement already satisfied: python-dateutil>=2.5.3 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (2.9.0)\n",
56+
"Requirement already satisfied: typing-extensions>=3.7.4 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (4.12.2)\n",
57+
"Requirement already satisfied: urllib3>=1.26.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pinecone) (2.1.0)\n",
58+
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (2.4.3)\n",
59+
"Requirement already satisfied: aiosignal>=1.1.2 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (1.3.1)\n",
60+
"Requirement already satisfied: attrs>=17.3.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (24.2.0)\n",
61+
"Requirement already satisfied: frozenlist>=1.1.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (1.5.0)\n",
62+
"Requirement already satisfied: multidict<7.0,>=4.5 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (6.1.0)\n",
63+
"Requirement already satisfied: propcache>=0.2.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (0.2.0)\n",
64+
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from aiohttp->datasets) (1.18.0)\n",
65+
"Requirement already satisfied: six>=1.5 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from python-dateutil>=2.5.3->pinecone) (1.16.0)\n",
66+
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from requests>=2.32.2->datasets) (2.0.4)\n",
67+
"Requirement already satisfied: idna<4,>=2.5 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from requests>=2.32.2->datasets) (3.7)\n",
68+
"Requirement already satisfied: pytz>=2020.1 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pandas->datasets) (2024.1)\n",
69+
"Requirement already satisfied: tzdata>=2022.7 in /opt/miniconda3/envs/pinecone-examples/lib/python3.11/site-packages (from pandas->datasets) (2024.1)\n"
70+
]
71+
}
72+
],
73+
"source": [
74+
"# Install with the %pip magic so packages land in the running kernel's environment\n",
75+
"%pip install datasets transformers pinecone"
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": 57,
81+
"metadata": {},
82+
"outputs": [],
83+
"source": [
84+
"# Path to an audio file that Hugging Face pipelines can read; leave empty to use the sample audio\n",
85+
"from getpass import getpass\n",
86+
"import os\n",
87+
"audio_path = \"\"\n",
88+
"transcription_result = []\n",
89+
"# Prefer the PINECONE_API_KEY environment variable; prompt only when it is unset or empty\n",
90+
"api_key = os.environ.get('PINECONE_API_KEY') or getpass('Enter your Pinecone API key: ')"
91+
]
92+
},
93+
{
94+
"cell_type": "markdown",
95+
"metadata": {},
96+
"source": [
97+
"## Transcribe the sample audio or your own file"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": 53,
103+
"metadata": {},
104+
"outputs": [
105+
{
106+
"name": "stderr",
107+
"output_type": "stream",
108+
"text": [
109+
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
110+
]
111+
}
112+
],
113+
"source": [
114+
"from datasets import load_dataset\n",
115+
"from transformers import pipeline\n",
116+
"\n",
117+
"# Give the model its own name so the imported `pipeline` factory is not shadowed\n",
118+
"asr_pipeline = pipeline(\n",
119+
"    task=\"automatic-speech-recognition\",\n",
120+
"    model=\"openai/whisper-large-v3\",\n",
121+
")\n",
122+
"\n",
123+
"if audio_path == \"\":\n",
124+
"    # No file supplied: use the Hugging Face sample from https://huggingface.co/learn/audio-course/en/chapter7/transcribe-meeting\n",
125+
"    concatenated_librispeech = load_dataset(\n",
126+
"        \"sanchit-gandhi/concatenated_librispeech\", split=\"train\")\n",
127+
"    sample_audio = concatenated_librispeech[0][\"audio\"][\"array\"]\n",
128+
"    transcription_result = asr_pipeline(sample_audio, return_timestamps=True)\n",
129+
"else:\n",
130+
"    # Use your own audio file, check out this for details: https://huggingface.co/openai/whisper-large-v3\n",
131+
"    transcription_result = asr_pipeline(audio_path, return_timestamps=True)\n"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": 54,
137+
"metadata": {},
138+
"outputs": [
139+
{
140+
"name": "stdout",
141+
"output_type": "stream",
142+
"text": [
143+
"[{'timestamp': (0.0, 15.1), 'text': ' the second in importance is as follows sovereignty may be defined to be the right of making laws in france the king really exercises a portion of the sovereign power since the laws have no weight'}, {'timestamp': (15.1, 21.72), 'text': \" he was in a fevered state of mind owing to the blight his wife's action threatened to cast upon his entire future\"}]\n"
144+
]
145+
}
146+
],
147+
"source": [
148+
"print(transcription_result['chunks'])"
149+
]
150+
},
151+
{
152+
"cell_type": "markdown",
153+
"metadata": {},
154+
"source": [
155+
"## Convert into records and upsert with Integrated Inference\n",
156+
"\n",
157+
"\n",
158+
"Integrated Inference from Pinecone lets you embed your records with a hosted embedding model, and upsert them into\n",
159+
"a Pinecone index at the same time! We've included some batching code in case your audio file is long."
160+
]
161+
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": 55,
165+
"metadata": {},
166+
"outputs": [],
167+
"source": [
168+
"# Import the Pinecone library\n",
169+
"from pinecone import Pinecone\n",
170+
"\n",
171+
"# Use sentences as chunks: every transcribed chunk becomes one record for upsertion,\n",
172+
"# keyed by its position in the transcript.\n",
173+
"records = [\n",
174+
"    # add any other desired metadata here\n",
175+
"    {\"_id\": str(position), \"sentence\": segment[\"text\"]}\n",
176+
"    for position, segment in enumerate(transcription_result[\"chunks\"])\n",
177+
"]\n",
178+
"\n",
179+
"# Namespace for this meeting and the name of the index that holds it\n",
180+
"namespace = \"meeting-1\"\n",
181+
"index_name = \"meeting-transcription-index\"\n",
182+
"\n",
183+
"# Initialize a Pinecone client with your API key\n",
184+
"pc = Pinecone(api_key=api_key)\n",
185+
"\n",
186+
"# Create a dense index with integrated embedding, unless it already exists\n",
187+
"if not pc.has_index(index_name):\n",
188+
"    pc.create_index_for_model(\n",
189+
"        name=index_name,\n",
190+
"        cloud=\"aws\",\n",
191+
"        region=\"us-east-1\",\n",
192+
"        embed={\n",
193+
"            \"model\": \"llama-text-embed-v2\",\n",
194+
"            \"field_map\": {\"text\": \"sentence\"},\n",
195+
"        },\n",
196+
"    )\n",
197+
"\n",
198+
"# Handle to the index, used by the upsert and search cells below\n",
199+
"index = pc.Index(index_name)\n",
200+
"\n"
201+
]
202+
},
203+
{
204+
"cell_type": "code",
205+
"execution_count": 56,
206+
"metadata": {},
207+
"outputs": [],
208+
"source": [
209+
"# Upsert the records into Pinecone in batches\n",
210+
"def batch_upsert(records, batch_size=96, namespace=namespace):\n",
211+
"    \"\"\"Upsert `records` into `namespace` in consecutive slices of at most `batch_size` (great for longer audio files).\"\"\"\n",
212+
"    batch_starts = range(0, len(records), batch_size)\n",
213+
"    for start in batch_starts:\n",
214+
"        index.upsert_records(namespace=namespace, records=records[start:start + batch_size])\n",
215+
"\n",
216+
"batch_upsert(records)"
217+
]
218+
},
219+
{
220+
"cell_type": "markdown",
221+
"metadata": {},
222+
"source": [
223+
"## Query the index with integrated inference"
224+
]
225+
},
226+
{
227+
"cell_type": "code",
228+
"execution_count": null,
229+
"metadata": {},
230+
"outputs": [],
231+
"source": [
232+
"import time\n",
233+
"\n",
234+
"# Replace with your own query here if needed\n",
235+
"query = \"Tell me about the king of France\"\n",
236+
"\n",
237+
"# Depending on the size of your dataset, embedding the records and populating\n",
238+
"# the index may take a few seconds, so pause before searching.\n",
239+
"time.sleep(10)\n",
240+
"\n",
241+
"results = index.search(\n",
242+
"    query={\"inputs\": {\"text\": query}, \"top_k\": 5},\n",
243+
"    namespace=namespace,\n",
244+
")\n",
245+
"\n",
246+
"# Show the raw search response for the query above\n",
247+
"\n",
248+
"print(results)"
249+
]
250+
},
251+
{
252+
"cell_type": "code",
253+
"execution_count": null,
254+
"metadata": {},
255+
"outputs": [],
256+
"source": [
257+
"# Cleanup: uncomment the line below to delete the index once you are finished\n",
258+
"\n",
259+
"#pc.delete_index(name=index_name)"
260+
]
261+
}
262+
],
263+
"metadata": {
264+
"kernelspec": {
265+
"display_name": "pinecone-examples",
266+
"language": "python",
267+
"name": "python3"
268+
},
269+
"language_info": {
270+
"codemirror_mode": {
271+
"name": "ipython",
272+
"version": 3
273+
},
274+
"file_extension": ".py",
275+
"mimetype": "text/x-python",
276+
"name": "python",
277+
"nbconvert_exporter": "python",
278+
"pygments_lexer": "ipython3",
279+
"version": "3.11.9"
280+
}
281+
},
282+
"nbformat": 4,
283+
"nbformat_minor": 2
284+
}

0 commit comments

Comments
 (0)