@@ -97,8 +97,6 @@ bedrock.S3DataSource(self, 'DataSource',
9797 knowledge_base = kb,
9898 data_source_name = ' books' ,
9999 chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
100- max_tokens = 500 ,
101- overlap_percentage = 20
102100)
103101
104102```
@@ -226,8 +224,6 @@ new bedrock.S3DataSource(this, "DataSource", {
226224 knowledgeBase: kb ,
227225 dataSourceName: " books" ,
228226 chunkingStrategy: bedrock .ChunkingStrategy .FIXED_SIZE ,
229- maxTokens: 500 ,
230- overlapPercentage: 20 ,
231227});
232228```
233229
@@ -287,9 +283,7 @@ bedrock.S3DataSource(self, 'DataSource',
287283 bucket = docBucket,
288284 knowledge_base = kb,
289285 data_source_name = ' books' ,
290- chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
291- max_tokens = 500 ,
292- overlap_percentage = 20
286+ chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
293287)
294288```
295289
@@ -323,8 +317,6 @@ new bedrock.S3DataSource(this, "DataSource", {
323317 knowledgeBase: kb ,
324318 dataSourceName: " books" ,
325319 chunkingStrategy: bedrock .ChunkingStrategy .FIXED_SIZE ,
326- maxTokens: 500 ,
327- overlapPercentage: 20 ,
328320});
329321```
330322
@@ -361,8 +353,6 @@ bedrock.S3DataSource(self, 'DataSource',
361353 knowledge_base = kb,
362354 data_source_name = ' books' ,
363355 chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
364- max_tokens = 500 ,
365- overlap_percentage = 20
366356)
367357```
368358
@@ -482,19 +472,139 @@ kb.addSharePointDataSource({
482472});
483473```
484474
475+ Python
476+
477+ ``` python
478+ from aws_cdk import (
479+ Stack,
480+ aws_s3 as s3,
481+ aws_lambda as _lambda,
482+ aws_secretsmanager as secretsmanager,
483+ aws_kms as kms
484+ )
485+ from constructs import Construct
486+ from cdklabs.generative_ai_cdk_constructs import (
487+ bedrock
488+ )
489+
490+ class PythonTestStack (Stack ):
491+
492+ def __init__ (self , scope : Construct, construct_id : str , ** kwargs ) -> None :
493+ super ().__init__ (scope, construct_id, ** kwargs)
494+
495+ kb = bedrock.KnowledgeBase(self , ' MyKnowledgeBase' ,
496+ embeddings_model = bedrock.BedrockFoundationModel.COHERE_EMBED_MULTILINGUAL_V3 ,
497+ )
498+
499+ docBucket = s3.Bucket(self , ' Bucket' )
500+
501+ function = _lambda.Function(self , ' MyFunction' ,
502+ runtime = _lambda.Runtime.PYTHON_3_12 ,
503+ handler = ' index.handler' ,
504+ code = _lambda.Code.from_inline(' print("Hello, World!")' ),
505+ )
506+
507+ kb.add_web_crawler_data_source(
508+ source_urls = [' https://docs.aws.amazon.com/' ],
509+ chunking_strategy = bedrock.ChunkingStrategy.HIERARCHICAL_COHERE ,
510+ custom_transformation = bedrock.CustomTransformation.lambda_(
511+ lambda_function = function,
512+ s3_bucket_uri = f ' s3:// { docBucket.bucket_name} /chunk-processor/ '
513+ )
514+ )
515+
516+ kb.add_s3_data_source(
517+ bucket = docBucket,
518+ chunking_strategy = bedrock.ChunkingStrategy.SEMANTIC ,
519+ parsing_strategy = bedrock.ParsingStategy.foundation_model(
520+ parsing_model = bedrock.BedrockFoundationModel.ANTHROPIC_CLAUDE_3_5_SONNET_V1_0 .as_i_model(self )
521+ )
522+ )
523+
524+ secret = secretsmanager.Secret(self , ' Secret' )
525+ key = kms.Key(self , ' Key' )
526+
527+ kb.add_confluence_data_source(
528+ data_source_name = ' TestDataSource' ,
529+ auth_secret = secret,
530+ kms_key = key,
531+ confluence_url = ' https://example.atlassian.net' ,
532+ filters = [
533+ bedrock.ConfluenceCrawlingFilters(
534+ object_type = bedrock.ConfluenceObjectType.ATTACHMENT ,
535+ include_patterns = [" .*\\ .pdf" ],
536+ exclude_patterns = [" .*private.*\\ .pdf" ],
537+ ),
538+ bedrock.ConfluenceCrawlingFilters(
539+ object_type = bedrock.ConfluenceObjectType.PAGE ,
540+ include_patterns = [" .*public.*\\ .pdf" ],
541+ exclude_patterns = [" .*confidential.*\\ .pdf" ],
542+ ),
543+ ]
544+ )
545+
546+ kb.add_salesforce_data_source(
547+ auth_secret = secret,
548+ endpoint = ' https://your-instance.my.salesforce.com' ,
549+ kms_key = key,
550+ filters = [
551+ bedrock.SalesforceCrawlingFilters(
552+ object_type = bedrock.SalesforceObjectType.ATTACHMENT ,
553+ include_patterns = [" .*\\ .pdf" ],
554+ exclude_patterns = [" .*private.*\\ .pdf" ],
555+ ),
556+ bedrock.SalesforceCrawlingFilters(
557+ object_type = bedrock.SalesforceObjectType.CONTRACT ,
558+ include_patterns = [" .*public.*\\ .pdf" ],
559+ exclude_patterns = [" .*confidential.*\\ .pdf" ],
560+ ),
561+ ]
562+ )
563+
564+ kb.add_share_point_data_source(
565+ data_source_name = ' SharepointDataSource' ,
566+ auth_secret = secret,
567+ kms_key = key,
568+ domain = ' yourDomain' ,
569+ site_urls = [' https://yourdomain.sharepoint.com/sites/mysite' ],
570+ tenant_id = ' 888d0b57-69f1-4fb8-957f-e1f0bedf64de' ,
571+ filters = [
572+ bedrock.SharePointCrawlingFilters(
573+ object_type = bedrock.SharePointObjectType.PAGE ,
574+ include_patterns = [" .*\\ .pdf" ],
575+ exclude_patterns = [" .*private.*\\ .pdf" ],
576+ ),
577+ bedrock.SharePointCrawlingFilters(
578+ object_type = bedrock.SharePointObjectType.FILE ,
579+ include_patterns = [" .*public.*\\ .pdf" ],
580+ exclude_patterns = [" .*confidential.*\\ .pdf" ],
581+ ),
582+ ]
583+ )
584+
585+ ```
586+
485587#### Knowledge Base - Chunking Strategies
486588
487589- ** Default Chunking** : Applies Fixed Chunking with the default chunk size of 300 tokens and 20% overlap.
488590
591+ TypeScript
489592 ``` ts
490593 ChunkingStrategy .DEFAULT ;
491594 ```
492595
596+ Python
597+
598+ ``` python
599+ ChunkingStrategy.DEFAULT
600+ ```
601+
493602- ** Fixed Size Chunking** : This method divides the data into fixed-size chunks, with each chunk
494603 containing a predetermined number of tokens. This strategy is useful when the data is uniform
495604 in size and structure.
496605 Typescript
497606
607+ TypeScript
498608 ``` ts
499609 // Fixed Size Chunking with sane defaults.
500610 ChunkingStrategy .FIXED_SIZE ;
@@ -503,10 +613,24 @@ kb.addSharePointDataSource({
503613 ChunkingStrategy .fixedSize ({ maxTokens: 200 , overlapPercentage: 25 });
504614 ```
505615
616+ Python
617+
618+ ``` python
619+ # Fixed Size Chunking with sane defaults.
620+ ChunkingStrategy.FIXED_SIZE
621+
622+ # Fixed Size Chunking with custom values.
623+ ChunkingStrategy.fixed_size(
624+ max_tokens = 200 ,
625+ overlap_percentage = 25
626+ )
627+ ```
628+
506629- ** Hierarchical Chunking** : This strategy organizes data into layers of chunks, with the first
507630 layer containing large chunks and the second layer containing smaller chunks derived from the first.
508631 It is ideal for data with inherent hierarchies or nested structures.
509632
633+ TypeScript
510634 ``` ts
511635 // Hierarchical Chunking with the default for Cohere Models.
512636 ChunkingStrategy .HIERARCHICAL_COHERE ;
@@ -523,10 +647,29 @@ kb.addSharePointDataSource({
523647 });
524648 ```
525649
650+ Python
651+
652+ ``` python
653+ # Hierarchical Chunking with the default for Cohere Models.
654+ ChunkingStrategy.HIERARCHICAL_COHERE
655+
656+ # Hierarchical Chunking with the default for Titan Models.
657+ ChunkingStrategy.HIERARCHICAL_TITAN
658+
659+ # Hierarchical Chunking with custom values. The maximum chunk size depends on the model.
660+ # Amazon Titan Text Embeddings: 8192. Cohere Embed models: 512
661+ chunking_strategy= ChunkingStrategy.hierarchical(
662+ overlap_tokens = 60 ,
663+ max_parent_token_size = 1500 ,
664+ max_child_token_size = 300
665+ )
666+ ```
667+
526668- ** Semantic Chunking** : This method splits data into smaller documents based on groups of similar
527669 content derived from the text using natural language processing. It helps preserve contextual
528670 relationships and ensures accurate and contextually appropriate results.
529671
672+ TypeScript
530673 ``` ts
531674 // Semantic Chunking with sane defaults.
532675 ChunkingStrategy .SEMANTIC ;
@@ -535,13 +678,34 @@ kb.addSharePointDataSource({
535678 ChunkingStrategy .semantic ({ bufferSize: 0 , breakpointPercentileThreshold: 95 , maxTokens: 300 });
536679 ```
537680
681+ Python
682+
683+ ``` python
684+ # Semantic Chunking with sane defaults.
685+ ChunkingStrategy.SEMANTIC
686+
687+ # Semantic Chunking with custom values.
688+ ChunkingStrategy.semantic(
689+ buffer_size = 0 ,
690+ breakpoint_percentile_threshold = 95 ,
691+ max_tokens = 300
692+ )
693+ ```
694+
538695- ** No Chunking** : This strategy treats each file as one chunk. If you choose this option,
539696 you may want to pre-process your documents by splitting them into separate files.
540697
698+ TypeScript
541699 ``` ts
542700 ChunkingStrategy .NONE ;
543701 ```
544702
703+ Python
704+
705+ ``` python
706+ ChunkingStrategy.NONE
707+ ```
708+
545709#### Knowledge Base - Parsing Strategy
546710
547711A parsing strategy in Amazon Bedrock is a configuration that determines how the service
@@ -557,12 +721,21 @@ two parsing strategies:
557721 the contents of the document. It is particularly useful for improved processing of PDF files
558722 with tables and images. To use this strategy, set the ` parsingStrategy ` in a data source as below.
559723
724+ TypeScript
560725 ``` ts
561726 bedrock .ParsingStategy .foundationModel ({
562727 model: BedrockFoundationModel .ANTHROPIC_CLAUDE_SONNET_V1_0 .asIModel (stack ),
563728 });
564729 ```
565730
731+ Python
732+
733+ ``` python
734+ bedrock.ParsingStategy.foundation_model(
735+ parsing_model = BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0 .as_i_model(self )
736+ )
737+ ```
738+
566739#### Knowledge Base - Custom Transformation
567740
568741Custom Transformation in Amazon Bedrock is a feature that allows you to create and apply
@@ -572,13 +745,22 @@ Custom Transformation uses AWS Lambda functions to process documents, enabling y
572745perform custom operations such as data extraction, normalization, or enrichment. To
573746create a custom transformation, set the ` customTransformation ` in a data source as below.
574747
575- ``` ts
748+ TypeScript
749+ ``` ts
576750CustomTransformation .lambda ({
577751 lambdaFunction: lambdaFunction ,
578752 s3BucketUri: ` s3://${bucket .bucketName }/chunk-processor/ ` ,
579753}),
580754```
581755
756+ Python
757+ ``` python
758+ CustomTransformation.lambda_(
759+ lambda_function = function,
760+ s3_bucket_uri = f ' s3:// { docBucket.bucket_name} /chunk-processor/ '
761+ )
762+ ```
763+
582764## Agents
583765
584766Enable generative AI applications to execute multistep tasks across company systems and data sources.
0 commit comments