Skip to content

Commit 959c257

Browse files
committed
examples: adapt to the new JSON format
Signed-off-by: Yoan Salambashev <yoan.salambashev@broadcom.com>
1 parent e770e2c commit 959c257

File tree

4 files changed

+21
-19
lines changed

4 files changed

+21
-19
lines changed

examples/embed-ingest-job-example/20_clean_and_embed_json_data.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,8 @@ def load_and_clean_documents(json_file_path):
4646
documents = json.load(file)
4747

4848
for doc in documents:
49-
if "page_content" in doc:
50-
cleaned_text = clean_text(doc["page_content"])
49+
if "data" in doc:
50+
cleaned_text = clean_text(doc["data"])
5151
cleaned_documents.append([cleaned_text])
5252

5353
print(len(cleaned_documents))

examples/embed-ingest-job-example/30_create_schema.sql

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ CREATE TABLE IF NOT EXISTS public.vdk_confluence_doc_metadata_example
1313
id INTEGER PRIMARY KEY,
1414
title TEXT,
1515
source TEXT,
16-
content TEXT,
16+
data TEXT,
17+
deleted BOOLEAN,
1718
CONSTRAINT fk_metadata_embeddings FOREIGN KEY (id) REFERENCES public.vdk_confluence_doc_embeddings_example(id)
1819
);

examples/embed-ingest-job-example/40_ingest_embeddings.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,9 @@ def run(job_input: IJobInput):
4444
metadata_payload = {
4545
"id": document["metadata"]["id"],
4646
"title": document["metadata"]["title"],
47-
"content": document["page_content"],
47+
"data": document["data"],
4848
"source": document["metadata"]["source"],
49+
"deleted": document["metadata"]["deleted"],
4950
}
5051
job_input.send_object_for_ingestion(
5152
payload=metadata_payload,

examples/embed-ingest-job-example/documents_example.json

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -3,45 +3,45 @@
33
"metadata": {
44
"title": "Getting Started",
55
"id": "123213312",
6-
"source": "https://github.com/vmware/versatile-data-kit/wiki/Getting-Started"
6+
"source": "https://github.com/vmware/versatile-data-kit/wiki/Getting-Started",
7+
"deleted": false
78
},
8-
"page_content": "VDK Getting Started guide",
9-
"deleted": false
9+
"data": "VDK Getting Started guide"
1010
},
1111
{
1212
"metadata": {
1313
"title": "VDK Wiki",
1414
"id": "747124724",
15-
"source": "https://github.com/vmware/versatile-data-kit/wiki"
15+
"source": "https://github.com/vmware/versatile-data-kit/wiki",
16+
"deleted": false
1617
},
17-
"page_content": "VDK Wiki",
18-
"deleted": false
18+
"data": "VDK Wiki"
1919
},
2020
{
2121
"metadata": {
2222
"title": "VDK Issues",
2323
"id": "721295269",
24-
"source": "https://github.com/vmware/versatile-data-kit/issues"
24+
"source": "https://github.com/vmware/versatile-data-kit/issues",
25+
"deleted": false
2526
},
26-
"page_content": "VDK Issues",
27-
"deleted": false
27+
"data": "VDK Issues"
2828
},
2929
{
3030
"metadata": {
3131
"title": "VDK PRs",
3232
"id": "1323122133",
33-
"source": "https://github.com/vmware/versatile-data-kit/pulls"
33+
"source": "https://github.com/vmware/versatile-data-kit/pulls",
34+
"deleted": false
3435
},
35-
"page_content": "VDK Pull Requests",
36-
"deleted": false
36+
"data": "VDK Pull Requests"
3737
},
3838
{
3939
"metadata": {
4040
"title": "VDK Main Page",
4141
"id": "312343243",
42-
"source": "https://github.com/vmware/versatile-data-kit/tree/main"
42+
"source": "https://github.com/vmware/versatile-data-kit/tree/main",
43+
"deleted": false
4344
},
44-
"page_content": "VDK: One framework to develop, deploy and operate data workflows with Python and SQL.",
45-
"deleted": false
45+
"data": "VDK: One framework to develop, deploy and operate data workflows with Python and SQL."
4646
}
4747
]

0 commit comments

Comments
 (0)