n8n-io
diff --git a/‎packages/@n8n/nodes-langchain/nodes/document_loaders/DocumentDefaultDataLoader/DocumentDefaultDataLoader.node.ts
Lines changed: 352 additions & 31 deletions b/‎packages/@n8n/nodes-langchain/nodes/document_loaders/DocumentDefaultDataLoader/DocumentDefaultDataLoader.node.ts
Lines changed: 352 additions & 31 deletions
@@ -1,38 +1,359 @@
-import type { INodeTypeBaseDescription, IVersionedNodeType } from 'n8n-workflow';
-import { VersionedNodeType } from 'n8n-workflow';
-
-import { DocumentDefaultDataLoaderV1 } from './V1/DocumentDefaultDataLoaderV1.node';
-import { DocumentDefaultDataLoaderV2 } from './V2/DocumentDefaultDataLoaderV2.node';
-
-export class DocumentDefaultDataLoader extends VersionedNodeType {
-	constructor() {
-		const baseDescription: INodeTypeBaseDescription = {
-			displayName: 'Default Data Loader',
-			name: 'documentDefaultDataLoader',
-			icon: 'file:binary.svg',
-			group: ['transform'],
-			description: 'Load data from previous step in the workflow',
-			codex: {
-				categories: ['AI'],
-				subcategories: {
-					AI: ['Document Loaders'],
+/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
+import { RecursiveCharacterTextSplitter, type TextSplitter } from '@langchain/textsplitters';
+import {
+	NodeConnectionTypes,
+	type INodeType,
+	type INodeTypeDescription,
+	type ISupplyDataFunctions,
+	type SupplyData,
+	type IDataObject,
+	type INodeInputConfiguration,
+} from 'n8n-workflow';
+
+import { logWrapper } from '@utils/logWrapper';
+import { N8nBinaryLoader } from '@utils/N8nBinaryLoader';
+import { N8nJsonLoader } from '@utils/N8nJsonLoader';
+import { metadataFilterField } from '@utils/sharedFields';
+
+// Dependencies needed underneath the hood for the loaders. We add them
+// here only to track where what dependency is sued
+// import 'd3-dsv'; // for csv
+import 'mammoth'; // for docx
+import 'epub2'; // for epub
+import 'pdf-parse'; // for pdf
+
+function getInputs(parameters: IDataObject) {
+	const inputs: INodeInputConfiguration[] = [];
+
+	const textSplittingMode = parameters?.textSplittingMode;
+	// If text splitting mode is 'custom' or does not exist (v1), we need to add an input for the text splitter
+	if (!textSplittingMode || textSplittingMode === 'custom') {
+		inputs.push({
+			displayName: 'Text Splitter',
+			maxConnections: 1,
+			type: 'ai_textSplitter',
+			required: true,
+		});
+	}
+
+	return inputs;
+}
+
+export class DocumentDefaultDataLoader implements INodeType {
+	description: INodeTypeDescription = {
+		displayName: 'Default Data Loader',
+		name: 'documentDefaultDataLoader',
+		icon: 'file:binary.svg',
+		group: ['transform'],
+		version: [1, 1.1],
+		defaultVersion: 1.1,
+		description: 'Load data from previous step in the workflow',
+		defaults: {
+			name: 'Default Data Loader',
+		},
+		codex: {
+			categories: ['AI'],
+			subcategories: {
+				AI: ['Document Loaders'],
+			},
+			resources: {
+				primaryDocumentation: [
+					{
+						url: 'https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.documentdefaultdataloader/',
+					},
+				],
+			},
+		},
+		// eslint-disable-next-line n8n-nodes-base/node-class-description-inputs-wrong-regular-node
+		inputs: `={{ ((parameter) => { ${getInputs.toString()}; return getInputs(parameter) })($parameter) }}`,
+		// eslint-disable-next-line n8n-nodes-base/node-class-description-outputs-wrong
+		outputs: [NodeConnectionTypes.AiDocument],
+		outputNames: ['Document'],
+		properties: [
+			{
+				displayName:
+					'This will load data from a previous step in the workflow. <a href="/templates/1962" target="_blank">Example</a>',
+				name: 'notice',
+				type: 'notice',
+				default: '',
+			},
+			{
+				displayName: 'Text Splitting',
+				name: 'textSplittingMode',
+				type: 'options',
+				default: 'simple',
+				required: true,
+				noDataExpression: true,
+				displayOptions: {
+					show: {
+						'@version': [1.2],
+					},
 				},
-				resources: {
-					primaryDocumentation: [
-						{
-							url: 'https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.documentdefaultdataloader/',
-						},
-					],
+				options: [
+					{
+						name: 'Simple',
+						value: 'simple',
+						description: 'Uses Recursive Character Text Splitter with default options',
+					},
+					{
+						name: 'Custom',
+						value: 'custom',
+						description: 'Connect a text splitter of your choice',
+					},
+				],
+			},
+			{
+				displayName: 'Type of Data',
+				name: 'dataType',
+				type: 'options',
+				default: 'json',
+				required: true,
+				noDataExpression: true,
+				options: [
+					{
+						name: 'JSON',
+						value: 'json',
+						description: 'Process JSON data from previous step in the workflow',
+					},
+					{
+						name: 'Binary',
+						value: 'binary',
+						description: 'Process binary data from previous step in the workflow',
+					},
+				],
+			},
+			{
+				displayName: 'Mode',
+				name: 'jsonMode',
+				type: 'options',
+				default: 'allInputData',
+				required: true,
+				displayOptions: {
+					show: {
+						dataType: ['json'],
+					},
 				},
+				options: [
+					{
+						name: 'Load All Input Data',
+						value: 'allInputData',
+						description: 'Use all JSON data that flows into the parent agent or chain',
+					},
+					{
+						name: 'Load Specific Data',
+						value: 'expressionData',
+						description:
+							'Load a subset of data, and/or data from any previous step in the workflow',
+					},
+				],
 			},
-			defaultVersion: 2,
-		};
+			{
+				displayName: 'Mode',
+				name: 'binaryMode',
+				type: 'options',
+				default: 'allInputData',
+				required: true,
+				displayOptions: {
+					show: {
+						dataType: ['binary'],
+					},
+				},
+				options: [
+					{
+						name: 'Load All Input Data',
+						value: 'allInputData',
+						description: 'Use all Binary data that flows into the parent agent or chain',
+					},
+					{
+						name: 'Load Specific Data',
+						value: 'specificField',
+						description: 'Load data from a specific field in the parent agent or chain',
+					},
+				],
+			},
+			{
+				displayName: 'Data Format',
+				name: 'loader',
+				type: 'options',
+				default: 'auto',
+				required: true,
+				displayOptions: {
+					show: {
+						dataType: ['binary'],
+					},
+				},
+				options: [
+					{
+						name: 'Automatically Detect by Mime Type',
+						value: 'auto',
+						description: 'Uses the mime type to detect the format',
+					},
+					{
+						name: 'CSV',
+						value: 'csvLoader',
+						description: 'Load CSV files',
+					},
+					{
+						name: 'Docx',
+						value: 'docxLoader',
+						description: 'Load Docx documents',
+					},
+					{
+						name: 'EPub',
+						value: 'epubLoader',
+						description: 'Load EPub files',
+					},
+					{
+						name: 'JSON',
+						value: 'jsonLoader',
+						description: 'Load JSON files',
+					},
+					{
+						name: 'PDF',
+						value: 'pdfLoader',
+						description: 'Load PDF documents',
+					},
+					{
+						name: 'Text',
+						value: 'textLoader',
+						description: 'Load plain text files',
+					},
+				],
+			},
+			{
+				displayName: 'Data',
+				name: 'jsonData',
+				type: 'string',
+				typeOptions: {
+					rows: 6,
+				},
+				default: '',
+				required: true,
+				description: 'Drag and drop fields from the input pane, or use an expression',
+				displayOptions: {
+					show: {
+						dataType: ['json'],
+						jsonMode: ['expressionData'],
+					},
+				},
+			},
+			{
+				displayName: 'Input Data Field Name',
+				name: 'binaryDataKey',
+				type: 'string',
+				default: 'data',
+				required: true,
+				description:
+					'The name of the field in the agent or chain’s input that contains the binary file to be processed',
+				displayOptions: {
+					show: {
+						dataType: ['binary'],
+					},
+					hide: {
+						binaryMode: ['allInputData'],
+					},
+				},
+			},
+			{
+				displayName: 'Options',
+				name: 'options',
+				type: 'collection',
+				placeholder: 'Add Option',
+				default: {},
+				options: [
+					{
+						displayName: 'JSON Pointers',
+						name: 'pointers',
+						type: 'string',
+						default: '',
+						description: 'Pointers to extract from JSON, e.g. "/text" or "/text, /meta/title"',
+						displayOptions: {
+							show: {
+								'/loader': ['jsonLoader', 'auto'],
+							},
+						},
+					},
+					{
+						displayName: 'CSV Separator',
+						name: 'separator',
+						type: 'string',
+						description: 'Separator to use for CSV',
+						default: ',',
+						displayOptions: {
+							show: {
+								'/loader': ['csvLoader', 'auto'],
+							},
+						},
+					},
+					{
+						displayName: 'CSV Column',
+						name: 'column',
+						type: 'string',
+						default: '',
+						description: 'Column to extract from CSV',
+						displayOptions: {
+							show: {
+								'/loader': ['csvLoader', 'auto'],
+							},
+						},
+					},
+					{
+						displayName: 'Split Pages in PDF',
+						description: 'Whether to split PDF pages into separate documents',
+						name: 'splitPages',
+						type: 'boolean',
+						default: true,
+						displayOptions: {
+							show: {
+								'/loader': ['pdfLoader', 'auto'],
+							},
+						},
+					},
+					{
+						...metadataFilterField,
+						displayName: 'Metadata',
+						description:
+							'Metadata to add to each document. Could be used for filtering during retrieval',
+						placeholder: 'Add property',
+					},
+				],
+			},
+		],
+	};
 
-		const nodeVersions: IVersionedNodeType['nodeVersions'] = {
-			1: new DocumentDefaultDataLoaderV1(baseDescription),
-			2: new DocumentDefaultDataLoaderV2(baseDescription),
-		};
+	async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
+		const node = this.getNode();
+		const dataType = this.getNodeParameter('dataType', itemIndex, 'json') as 'json' | 'binary';
+
+		let textSplitter: TextSplitter | undefined;
+
+		if (node.typeVersion === 1.1) {
+			const textSplittingMode = this.getNodeParameter('textSplittingMode', itemIndex, 'simple') as
+				| 'simple'
+				| 'custom';
 
-		super(nodeVersions, baseDescription);
+			if (textSplittingMode === 'simple') {
+				textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
+			} else if (textSplittingMode === 'custom') {
+				textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
+					| TextSplitter
+					| undefined;
+			}
+		} else {
+			textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
+				| TextSplitter
+				| undefined;
+		}
+
+		const binaryDataKey = this.getNodeParameter('binaryDataKey', itemIndex, '') as string;
+
+		const processor =
+			dataType === 'binary'
+				? new N8nBinaryLoader(this, 'options.', binaryDataKey, textSplitter)
+				: new N8nJsonLoader(this, 'options.', textSplitter);
+
+		return {
+			response: logWrapper(processor, this),
+		};
 	}
 }