Skip to content

Commit fa3f908

Browse files
nikhilkuria and cubic-dev-ai[bot]
authored and committed
feat(Default Data Loader Node): Add default text splitter (n8n-io#15786)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
1 parent c084872 commit fa3f908

File tree

4 files changed

+309
-27
lines changed

4 files changed

+309
-27
lines changed

packages/@n8n/nodes-langchain/nodes/document_loaders/DocumentDefaultDataLoader/DocumentDefaultDataLoader.node.ts

Lines changed: 70 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
11
/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
2-
import type { TextSplitter } from '@langchain/textsplitters';
2+
import { RecursiveCharacterTextSplitter, type TextSplitter } from '@langchain/textsplitters';
33
import {
44
NodeConnectionTypes,
55
type INodeType,
66
type INodeTypeDescription,
77
type ISupplyDataFunctions,
88
type SupplyData,
9+
type IDataObject,
10+
type INodeInputConfiguration,
911
} from 'n8n-workflow';
1012

1113
import { logWrapper } from '@utils/logWrapper';
@@ -20,13 +22,31 @@ import 'mammoth'; // for docx
2022
import 'epub2'; // for epub
2123
import 'pdf-parse'; // for pdf
2224

25+
/**
 * Builds the list of input connections for the node from its current parameters.
 *
 * @param parameters - the node's parameter values; only `textSplittingMode` is read.
 * @returns an array holding one required 'ai_textSplitter' input (at most one
 *   connection) when `textSplittingMode` is falsy or equals 'custom'; otherwise
 *   an empty array.
 *
 * NOTE: the node description embeds this function's source text into its
 * `inputs` expression via `getInputs.toString()`, so the body has to remain
 * self-contained. That is presumably why the connection type is written as a
 * string literal instead of referencing NodeConnectionTypes — confirm before
 * changing it.
 */
function getInputs(parameters: IDataObject) {
26+
const inputs: INodeInputConfiguration[] = [];
27+
28+
const textSplittingMode = parameters?.textSplittingMode;
29+
// If text splitting mode is 'custom' or does not exist (v1), we need to add an input for the text splitter
30+
if (!textSplittingMode || textSplittingMode === 'custom') {
31+
inputs.push({
32+
displayName: 'Text Splitter',
33+
maxConnections: 1,
34+
type: 'ai_textSplitter',
35+
required: true,
36+
});
37+
}
38+
39+
return inputs;
40+
}
41+
2342
export class DocumentDefaultDataLoader implements INodeType {
2443
description: INodeTypeDescription = {
2544
displayName: 'Default Data Loader',
2645
name: 'documentDefaultDataLoader',
2746
icon: 'file:binary.svg',
2847
group: ['transform'],
29-
version: 1,
48+
version: [1, 1.1],
49+
defaultVersion: 1.1,
3050
description: 'Load data from previous step in the workflow',
3151
defaults: {
3252
name: 'Default Data Loader',
@@ -45,14 +65,7 @@ export class DocumentDefaultDataLoader implements INodeType {
4565
},
4666
},
4767
// eslint-disable-next-line n8n-nodes-base/node-class-description-inputs-wrong-regular-node
48-
inputs: [
49-
{
50-
displayName: 'Text Splitter',
51-
maxConnections: 1,
52-
type: NodeConnectionTypes.AiTextSplitter,
53-
required: true,
54-
},
55-
],
68+
inputs: `={{ ((parameter) => { ${getInputs.toString()}; return getInputs(parameter) })($parameter) }}`,
5669
// eslint-disable-next-line n8n-nodes-base/node-class-description-outputs-wrong
5770
outputs: [NodeConnectionTypes.AiDocument],
5871
outputNames: ['Document'],
@@ -64,6 +77,31 @@ export class DocumentDefaultDataLoader implements INodeType {
6477
type: 'notice',
6578
default: '',
6679
},
80+
{
81+
displayName: 'Text Splitting',
82+
name: 'textSplittingMode',
83+
type: 'options',
84+
default: 'simple',
85+
required: true,
86+
noDataExpression: true,
87+
displayOptions: {
88+
show: {
89+
'@version': [1.1],
90+
},
91+
},
92+
options: [
93+
{
94+
name: 'Simple',
95+
value: 'simple',
96+
description: 'Uses the Recursive Character Text Splitter with default options',
97+
},
98+
{
99+
name: 'Custom',
100+
value: 'custom',
101+
description: 'Connect a text splitter of your choice',
102+
},
103+
],
104+
},
67105
{
68106
displayName: 'Type of Data',
69107
name: 'dataType',
@@ -284,11 +322,29 @@ export class DocumentDefaultDataLoader implements INodeType {
284322
};
285323

286324
async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
325+
const node = this.getNode();
287326
const dataType = this.getNodeParameter('dataType', itemIndex, 'json') as 'json' | 'binary';
288-
const textSplitter = (await this.getInputConnectionData(
289-
NodeConnectionTypes.AiTextSplitter,
290-
0,
291-
)) as TextSplitter | undefined;
327+
328+
let textSplitter: TextSplitter | undefined;
329+
330+
if (node.typeVersion === 1.1) {
331+
const textSplittingMode = this.getNodeParameter('textSplittingMode', itemIndex, 'simple') as
332+
| 'simple'
333+
| 'custom';
334+
335+
if (textSplittingMode === 'simple') {
336+
textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
337+
} else if (textSplittingMode === 'custom') {
338+
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
339+
| TextSplitter
340+
| undefined;
341+
}
342+
} else {
343+
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
344+
| TextSplitter
345+
| undefined;
346+
}
347+
292348
const binaryDataKey = this.getNodeParameter('binaryDataKey', itemIndex, '') as string;
293349

294350
const processor =
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,72 @@
1+
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
2+
import type { ISupplyDataFunctions } from 'n8n-workflow';
3+
import { NodeConnectionTypes } from 'n8n-workflow';
4+
5+
import { DocumentDefaultDataLoader } from '../DocumentDefaultDataLoader.node';
6+
7+
// Replace the '@langchain/textsplitters' module with a mock that exposes only
// RecursiveCharacterTextSplitter. The mocked constructor is a jest.fn, so tests
// can assert on the options it was called with. Each instance it returns has a
// splitDocuments mock that resolves to a shallow copy of every input document
// with an added `split: true` marker.
jest.mock('@langchain/textsplitters', () => ({
8+
RecursiveCharacterTextSplitter: jest.fn().mockImplementation(() => ({
9+
splitDocuments: jest.fn(
10+
async (docs: Array<Record<string, unknown>>): Promise<Array<Record<string, unknown>>> =>
11+
docs.map((doc) => ({ ...doc, split: true })),
12+
),
13+
})),
14+
}));
15+
16+
// Tests for DocumentDefaultDataLoader.supplyData on node typeVersion 1.1.
// Each test calls supplyData with a hand-built stub cast to
// ISupplyDataFunctions and checks which text splitter source was used for the
// selected `textSplittingMode`.
describe('DocumentDefaultDataLoader', () => {
17+
let loader: DocumentDefaultDataLoader;
18+
19+
// Create a fresh node instance and clear recorded mock calls before every test.
beforeEach(() => {
20+
loader = new DocumentDefaultDataLoader();
21+
jest.clearAllMocks();
22+
});
23+
24+
// 'simple' mode: the (mocked) RecursiveCharacterTextSplitter is expected to be
// constructed with chunkSize 1000 and chunkOverlap 200.
it('should supply data with recursive char text splitter', async () => {
25+
// Stub context: reports typeVersion 1.1 and returns fixed parameter values.
// It defines no getInputConnectionData.
const context = {
26+
getNode: jest.fn(() => ({ typeVersion: 1.1 })),
27+
getNodeParameter: jest.fn().mockImplementation((paramName, _itemIndex) => {
28+
switch (paramName) {
29+
case 'dataType':
30+
return 'json';
31+
case 'textSplittingMode':
32+
return 'simple';
33+
case 'binaryDataKey':
34+
return 'data';
35+
default:
36+
return;
37+
}
38+
}),
39+
} as unknown as ISupplyDataFunctions;
40+
41+
await loader.supplyData.call(context, 0);
42+
expect(RecursiveCharacterTextSplitter).toHaveBeenCalledWith({
43+
chunkSize: 1000,
44+
chunkOverlap: 200,
45+
});
46+
});
47+
48+
// 'custom' mode: the splitter is expected to be requested from the
// AiTextSplitter input connection at index 0.
it('should supply data with custom text splitter', async () => {
49+
// Pass-through splitter handed back by the stubbed input connection.
const customSplitter = { splitDocuments: jest.fn(async (docs) => docs) };
50+
const context = {
51+
getNode: jest.fn(() => ({ typeVersion: 1.1 })),
52+
getNodeParameter: jest.fn().mockImplementation((paramName, _itemIndex) => {
53+
switch (paramName) {
54+
case 'dataType':
55+
return 'json';
56+
case 'textSplittingMode':
57+
return 'custom';
58+
case 'binaryDataKey':
59+
return 'data';
60+
default:
61+
return;
62+
}
63+
}),
64+
getInputConnectionData: jest.fn(async () => customSplitter),
65+
} as unknown as ISupplyDataFunctions;
66+
await loader.supplyData.call(context, 0);
67+
expect(context.getInputConnectionData).toHaveBeenCalledWith(
68+
NodeConnectionTypes.AiTextSplitter,
69+
0,
70+
);
71+
});
72+
});

packages/@n8n/nodes-langchain/nodes/document_loaders/DocumentGithubLoader/DocumentGithubLoader.node.ts

Lines changed: 68 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,45 @@
11
/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
22
import { GithubRepoLoader } from '@langchain/community/document_loaders/web/github';
3-
import type { CharacterTextSplitter } from '@langchain/textsplitters';
3+
import type { TextSplitter } from '@langchain/textsplitters';
4+
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
45
import {
56
NodeConnectionTypes,
67
type INodeType,
78
type INodeTypeDescription,
89
type ISupplyDataFunctions,
910
type SupplyData,
11+
type IDataObject,
12+
type INodeInputConfiguration,
1013
} from 'n8n-workflow';
1114

1215
import { logWrapper } from '@utils/logWrapper';
1316
import { getConnectionHintNoticeField } from '@utils/sharedFields';
1417

18+
/**
 * Builds the list of input connections for the node from its current parameters.
 *
 * @param parameters - the node's parameter values; only `textSplittingMode` is read.
 * @returns an array holding one required 'ai_textSplitter' input (at most one
 *   connection) when `textSplittingMode` is falsy or equals 'custom'; otherwise
 *   an empty array.
 *
 * NOTE: the node description embeds this function's source text into its
 * `inputs` expression via `getInputs.toString()`, so the body has to remain
 * self-contained. That is presumably why the connection type is written as a
 * string literal instead of referencing NodeConnectionTypes — confirm before
 * changing it.
 *
 * NOTE(review): the static `inputs` entry this replaces in the GitHub loader
 * did not set `required`, whereas the input built here is always
 * `required: true`, including when `textSplittingMode` is absent (v1).
 * Confirm that making the splitter mandatory for existing v1 nodes is intended.
 */
function getInputs(parameters: IDataObject) {
19+
const inputs: INodeInputConfiguration[] = [];
20+
21+
const textSplittingMode = parameters?.textSplittingMode;
22+
// If text splitting mode is 'custom' or does not exist (v1), we need to add an input for the text splitter
23+
if (!textSplittingMode || textSplittingMode === 'custom') {
24+
inputs.push({
25+
displayName: 'Text Splitter',
26+
maxConnections: 1,
27+
type: 'ai_textSplitter',
28+
required: true,
29+
});
30+
}
31+
32+
return inputs;
33+
}
34+
1535
export class DocumentGithubLoader implements INodeType {
1636
description: INodeTypeDescription = {
1737
displayName: 'GitHub Document Loader',
1838
name: 'documentGithubLoader',
1939
icon: 'file:github.svg',
2040
group: ['transform'],
21-
version: 1,
41+
version: [1, 1.1],
42+
defaultVersion: 1.1,
2243
description: 'Use GitHub data as input to this chain',
2344
defaults: {
2445
name: 'GitHub Document Loader',
@@ -43,19 +64,38 @@ export class DocumentGithubLoader implements INodeType {
4364
},
4465
],
4566
// eslint-disable-next-line n8n-nodes-base/node-class-description-inputs-wrong-regular-node
46-
inputs: [
47-
{
48-
displayName: 'Text Splitter',
49-
maxConnections: 1,
50-
type: NodeConnectionTypes.AiTextSplitter,
51-
},
52-
],
67+
inputs: `={{ ((parameter) => { ${getInputs.toString()}; return getInputs(parameter) })($parameter) }}`,
5368
inputNames: ['Text Splitter'],
5469
// eslint-disable-next-line n8n-nodes-base/node-class-description-outputs-wrong
5570
outputs: [NodeConnectionTypes.AiDocument],
5671
outputNames: ['Document'],
5772
properties: [
5873
getConnectionHintNoticeField([NodeConnectionTypes.AiVectorStore]),
74+
{
75+
displayName: 'Text Splitting',
76+
name: 'textSplittingMode',
77+
type: 'options',
78+
default: 'simple',
79+
required: true,
80+
noDataExpression: true,
81+
displayOptions: {
82+
show: {
83+
'@version': [1.1],
84+
},
85+
},
86+
options: [
87+
{
88+
name: 'Simple',
89+
value: 'simple',
90+
description: 'Uses Recursive Character Text Splitter with default options',
91+
},
92+
{
93+
name: 'Custom',
94+
value: 'custom',
95+
description: 'Connect a text splitter of your choice',
96+
},
97+
],
98+
},
5999
{
60100
displayName: 'Repository Link',
61101
name: 'repository',
@@ -96,6 +136,7 @@ export class DocumentGithubLoader implements INodeType {
96136

97137
async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
98138
this.logger.debug('Supplying data for Github Document Loader');
139+
const node = this.getNode();
99140

100141
const repository = this.getNodeParameter('repository', itemIndex) as string;
101142
const branch = this.getNodeParameter('branch', itemIndex) as string;
@@ -104,11 +145,25 @@ export class DocumentGithubLoader implements INodeType {
104145
recursive: boolean;
105146
ignorePaths: string;
106147
};
148+
let textSplitter: TextSplitter | undefined;
149+
150+
if (node.typeVersion === 1.1) {
151+
const textSplittingMode = this.getNodeParameter('textSplittingMode', itemIndex, 'simple') as
152+
| 'simple'
153+
| 'custom';
107154

108-
const textSplitter = (await this.getInputConnectionData(
109-
NodeConnectionTypes.AiTextSplitter,
110-
0,
111-
)) as CharacterTextSplitter | undefined;
155+
if (textSplittingMode === 'simple') {
156+
textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
157+
} else if (textSplittingMode === 'custom') {
158+
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
159+
| TextSplitter
160+
| undefined;
161+
}
162+
} else {
163+
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
164+
| TextSplitter
165+
| undefined;
166+
}
112167

113168
const { index } = this.addInputData(NodeConnectionTypes.AiDocument, [
114169
[{ json: { repository, branch, ignorePaths, recursive } }],

0 commit comments

Comments
 (0)