11use std:: { panic, sync:: Arc } ;
22
33use arrow:: datatypes:: SchemaRef ;
4+ use object_store:: { path:: Path , ObjectStoreScheme } ;
45use parquet:: {
56 arrow:: {
67 arrow_to_parquet_schema,
@@ -19,20 +20,76 @@ use url::Url;
1920
2021use crate :: {
2122 arrow_parquet:: parquet_writer:: DEFAULT_ROW_GROUP_SIZE ,
22- object_store:: object_store_cache:: get_or_create_object_store, PG_BACKEND_TOKIO_RUNTIME ,
23+ object_store:: {
24+ aws:: parse_s3_bucket, azure:: parse_azure_blob_container,
25+ object_store_cache:: get_or_create_object_store,
26+ } ,
27+ PG_BACKEND_TOKIO_RUNTIME ,
2328} ;
2429
2530const PARQUET_OBJECT_STORE_READ_ROLE : & str = "parquet_object_store_read" ;
2631const PARQUET_OBJECT_STORE_WRITE_ROLE : & str = "parquet_object_store_write" ;
2732
28- pub ( crate ) fn parse_uri ( uri : & str ) -> Url {
29- if !uri. contains ( "://" ) {
30- // local file
31- return Url :: from_file_path ( uri)
32- . unwrap_or_else ( |_| panic ! ( "not a valid file path: {}" , uri) ) ;
33+ // ParsedUriInfo is a struct that holds the parsed uri information.
34+ #[ derive( Debug , Clone ) ]
35+ pub ( crate ) struct ParsedUriInfo {
36+ pub ( crate ) uri : Url ,
37+ pub ( crate ) bucket : Option < String > ,
38+ pub ( crate ) path : Path ,
39+ pub ( crate ) scheme : ObjectStoreScheme ,
40+ }
41+
42+ impl ParsedUriInfo {
43+ fn try_parse_uri ( uri : & str ) -> Result < Url , String > {
44+ if !uri. contains ( "://" ) {
45+ // local file
46+ Url :: from_file_path ( uri) . map_err ( |_| format ! ( "not a valid file path: {}" , uri) )
47+ } else {
48+ Url :: parse ( uri) . map_err ( |e| e. to_string ( ) )
49+ }
3350 }
3451
35- Url :: parse ( uri) . unwrap_or_else ( |e| panic ! ( "{}" , e) )
52+ fn try_parse_scheme ( uri : & Url ) -> Result < ( ObjectStoreScheme , Path ) , String > {
53+ ObjectStoreScheme :: parse ( uri) . map_err ( |_| {
54+ format ! (
55+ "unrecognized uri {}. pg_parquet supports local paths, s3:// or azure:// schemes." ,
56+ uri
57+ )
58+ } )
59+ }
60+
61+ fn try_parse_bucket ( scheme : & ObjectStoreScheme , uri : & Url ) -> Result < Option < String > , String > {
62+ match scheme {
63+ ObjectStoreScheme :: AmazonS3 => parse_s3_bucket ( uri)
64+ . ok_or ( format ! ( "unsupported s3 uri {uri}" ) )
65+ . map ( Some ) ,
66+ ObjectStoreScheme :: MicrosoftAzure => parse_azure_blob_container ( uri)
67+ . ok_or ( format ! ( "unsupported azure blob storage uri: {uri}" ) )
68+ . map ( Some ) ,
69+ ObjectStoreScheme :: Local => Ok ( None ) ,
70+ _ => Err ( format ! ( "unsupported scheme {} in uri {}. pg_parquet supports local paths, s3:// or azure:// schemes." ,
71+ uri. scheme( ) , uri) )
72+ }
73+ }
74+ }
75+
76+ impl TryFrom < & str > for ParsedUriInfo {
77+ type Error = String ;
78+
79+ fn try_from ( uri : & str ) -> Result < Self , Self :: Error > {
80+ let uri = Self :: try_parse_uri ( uri) ?;
81+
82+ let ( scheme, path) = Self :: try_parse_scheme ( & uri) ?;
83+
84+ let bucket = Self :: try_parse_bucket ( & scheme, & uri) ?;
85+
86+ Ok ( ParsedUriInfo {
87+ uri : uri. clone ( ) ,
88+ bucket,
89+ path,
90+ scheme,
91+ } )
92+ }
3693}
3794
3895pub ( crate ) fn uri_as_string ( uri : & Url ) -> String {
@@ -48,24 +105,27 @@ pub(crate) fn uri_as_string(uri: &Url) -> String {
48105 uri. to_string ( )
49106}
50107
51- pub ( crate ) fn parquet_schema_from_uri ( uri : & Url ) -> SchemaDescriptor {
52- let parquet_reader = parquet_reader_from_uri ( uri ) ;
108+ pub ( crate ) fn parquet_schema_from_uri ( uri_info : ParsedUriInfo ) -> SchemaDescriptor {
109+ let parquet_reader = parquet_reader_from_uri ( uri_info ) ;
53110
54111 let arrow_schema = parquet_reader. schema ( ) ;
55112
56113 arrow_to_parquet_schema ( arrow_schema) . unwrap_or_else ( |e| panic ! ( "{}" , e) )
57114}
58115
59- pub ( crate ) fn parquet_metadata_from_uri ( uri : & Url ) -> Arc < ParquetMetaData > {
116+ pub ( crate ) fn parquet_metadata_from_uri ( uri_info : ParsedUriInfo ) -> Arc < ParquetMetaData > {
60117 let copy_from = true ;
61- let ( parquet_object_store, location) = get_or_create_object_store ( uri , copy_from) ;
118+ let ( parquet_object_store, location) = get_or_create_object_store ( uri_info . clone ( ) , copy_from) ;
62119
63120 PG_BACKEND_TOKIO_RUNTIME . block_on ( async {
64121 let object_store_meta = parquet_object_store
65122 . head ( & location)
66123 . await
67124 . unwrap_or_else ( |e| {
68- panic ! ( "failed to get object store metadata for uri {}: {}" , uri, e)
125+ panic ! (
126+ "failed to get object store metadata for uri {}: {}" ,
127+ uri_info. uri, e
128+ )
69129 } ) ;
70130
71131 let parquet_object_reader =
@@ -79,16 +139,21 @@ pub(crate) fn parquet_metadata_from_uri(uri: &Url) -> Arc<ParquetMetaData> {
79139 } )
80140}
81141
82- pub ( crate ) fn parquet_reader_from_uri ( uri : & Url ) -> ParquetRecordBatchStream < ParquetObjectReader > {
142+ pub ( crate ) fn parquet_reader_from_uri (
143+ uri_info : ParsedUriInfo ,
144+ ) -> ParquetRecordBatchStream < ParquetObjectReader > {
83145 let copy_from = true ;
84- let ( parquet_object_store, location) = get_or_create_object_store ( uri , copy_from) ;
146+ let ( parquet_object_store, location) = get_or_create_object_store ( uri_info . clone ( ) , copy_from) ;
85147
86148 PG_BACKEND_TOKIO_RUNTIME . block_on ( async {
87149 let object_store_meta = parquet_object_store
88150 . head ( & location)
89151 . await
90152 . unwrap_or_else ( |e| {
91- panic ! ( "failed to get object store metadata for uri {}: {}" , uri, e)
153+ panic ! (
154+ "failed to get object store metadata for uri {}: {}" ,
155+ uri_info. uri, e
156+ )
92157 } ) ;
93158
94159 let parquet_object_reader =
@@ -108,17 +173,22 @@ pub(crate) fn parquet_reader_from_uri(uri: &Url) -> ParquetRecordBatchStream<Par
108173}
109174
110175pub ( crate ) fn parquet_writer_from_uri (
111- uri : & Url ,
176+ uri_info : ParsedUriInfo ,
112177 arrow_schema : SchemaRef ,
113178 writer_props : WriterProperties ,
114179) -> AsyncArrowWriter < ParquetObjectWriter > {
115180 let copy_from = false ;
116- let ( parquet_object_store, location) = get_or_create_object_store ( uri , copy_from) ;
181+ let ( parquet_object_store, location) = get_or_create_object_store ( uri_info . clone ( ) , copy_from) ;
117182
118183 let parquet_object_writer = ParquetObjectWriter :: new ( parquet_object_store, location) ;
119184
120185 AsyncArrowWriter :: try_new ( parquet_object_writer, arrow_schema, Some ( writer_props) )
121- . unwrap_or_else ( |e| panic ! ( "failed to create parquet writer for uri {}: {}" , uri, e) )
186+ . unwrap_or_else ( |e| {
187+ panic ! (
188+ "failed to create parquet writer for uri {}: {}" ,
189+ uri_info. uri, e
190+ )
191+ } )
122192}
123193
124194pub ( crate ) fn ensure_access_privilege_to_uri ( uri : & Url , copy_from : bool ) {
0 commit comments