
Commit ec5c082

Support Google Cloud Storage

1 parent 3ff46d5 commit ec5c082

11 files changed: +242 -16 lines changed

.devcontainer/.env

Lines changed: 5 additions & 0 deletions
```diff
@@ -18,5 +18,10 @@ AZURE_TEST_CONTAINER_NAME=testcontainer
 AZURE_TEST_READ_ONLY_SAS="se=2100-05-05&sp=r&sv=2022-11-02&sr=c&sig=YMPFnAHKe9y0o3hFegncbwQTXtAyvsJEgPB2Ne1b9CQ%3D"
 AZURE_TEST_READ_WRITE_SAS="se=2100-05-05&sp=rcw&sv=2022-11-02&sr=c&sig=TPz2jEz0t9L651t6rTCQr%2BOjmJHkM76tnCGdcyttnlA%3D"
 
+# GCS tests
+GOOGLE_TEST_BUCKET=testbucket
+GOOGLE_SERVICE_ACCOUNT_KEY='{"gcs_base_url": "http://localhost:4443","disable_oauth": true,"client_email": "","private_key_id": "","private_key": ""}'
+GOOGLE_SERVICE_ENDPOINT=http://localhost:4443
+
 # Others
 RUST_TEST_THREADS=1
```

.devcontainer/docker-compose.yml

Lines changed: 15 additions & 0 deletions
```diff
@@ -12,6 +12,7 @@ services:
       - ${USERPROFILE}${HOME}/.gitconfig:/home/rust/.gitconfig:ro
       - ${USERPROFILE}${HOME}/.aws:/home/rust/.aws:rw
       - ${USERPROFILE}${HOME}/.azure:/home/rust/.azure:rw
+      - ${USERPROFILE}${HOME}/.config/gcloud:/home/rust/.config/gcloud:rw
       - ./entrypoint.sh:/entrypoint.sh
     env_file:
       - .env
@@ -20,6 +21,7 @@ services:
     depends_on:
       - minio
       - azurite
+      - fake-gcs-server
 
   minio:
     image: minio/minio
@@ -47,3 +49,16 @@ services:
       interval: 6s
       timeout: 2s
       retries: 3
+
+  fake-gcs-server:
+    image: tustvold/fake-gcs-server
+    env_file:
+      - .env
+    network_mode: host
+    command: -scheme http -public-host localhost:4443
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "nc", "-z", "localhost", "4443"]
+      interval: 6s
+      timeout: 2s
+      retries: 3
```
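For a quick manual check that the emulator container actually came up, the same probe the compose healthcheck runs can be used from the host (a minimal sketch, assuming port 4443 is reachable on localhost):

```bash
# Succeeds once fake-gcs-server accepts TCP connections on :4443,
# mirroring the compose healthcheck above.
nc -z localhost 4443 && echo "fake-gcs-server is listening on :4443"
```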

.devcontainer/entrypoint.sh

Lines changed: 4 additions & 0 deletions
```diff
@@ -6,4 +6,8 @@ trap "echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM
 az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING
 az storage container create -n ${AZURE_TEST_CONTAINER_NAME}2 --connection-string $AZURE_STORAGE_CONNECTION_STRING
 
+# create fake-gcs bucket
+curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b"
+curl -v -X POST --data-binary "{\"name\":\"${GOOGLE_TEST_BUCKET}2\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b"
+
 sleep infinity
```
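To verify that both buckets were created, they can be listed back through the same JSON API, a sketch assuming fake-gcs-server mirrors the real `GET /storage/v1/b` listing endpoint used by the POST calls above:

```bash
# List buckets on the emulator; with the devcontainer defaults the response
# should mention "testbucket" and "testbucket2".
curl -s "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b"
```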

.github/workflows/ci.yml

Lines changed: 16 additions & 0 deletions
```diff
@@ -140,6 +140,22 @@ jobs:
           az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING
           az storage container create -n ${AZURE_TEST_CONTAINER_NAME}2 --connection-string $AZURE_STORAGE_CONNECTION_STRING
 
+      - name: Start fake-gcs-server for Google Cloud Storage emulator tests
+        run: |
+          docker run -d \
+            --env-file .devcontainer/.env \
+            -p 4443:4443 \
+            tustvold/fake-gcs-server -scheme http -public-host localhost:4443
+
+          while ! curl $GOOGLE_SERVICE_ENDPOINT; do
+            echo "Waiting for $GOOGLE_SERVICE_ENDPOINT..."
+            sleep 1
+          done
+
+          # create bucket
+          curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b"
+          curl -v -X POST --data-binary "{\"name\":\"${GOOGLE_TEST_BUCKET}2\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b"
+
       - name: Run tests
         run: |
           # Run tests with coverage tool
```

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -28,7 +28,7 @@ aws-credential-types = {version = "1", default-features = false}
 azure_storage = {version = "0.21", default-features = false}
 futures = "0.3"
 home = "0.5"
-object_store = {version = "0.11", default-features = false, features = ["aws", "azure"]}
+object_store = {version = "0.11", default-features = false, features = ["aws", "azure", "gcp"]}
 once_cell = "1"
 parquet = {version = "53", default-features = false, features = [
     "arrow",
```

README.md

Lines changed: 23 additions & 1 deletion
````diff
@@ -156,7 +156,7 @@ SELECT uri, encode(key, 'escape') as key, encode(value, 'escape') as value FROM
 ```
 
 ## Object Store Support
-`pg_parquet` supports reading and writing Parquet files from/to `S3` and `Azure Blob Storage` object stores.
+`pg_parquet` supports reading and writing Parquet files from/to `S3`, `Azure Blob Storage` and `Google Cloud Storage` object stores.
 
 > [!NOTE]
 > To be able to write into a object store location, you need to grant `parquet_object_store_write` role to your current postgres user.
@@ -239,6 +239,28 @@ Supported authorization methods' priority order is shown below:
 2. Sas token,
 3. Storage key.
 
+#### Google Cloud Storage
+
+The simplest way to configure object storage is by creating a json config file like [`/tmp/gcs.json`]:
+
+```bash
+$ cat /tmp/gcs.json
+{
+    "gcs_base_url": "http://localhost:4443",
+    "disable_oauth": true,
+    "client_email": "",
+    "private_key_id": "",
+    "private_key": ""
+}
+```
+
+Alternatively, you can use the following environment variables when starting postgres to configure the Google Cloud Storage client:
+- `GOOGLE_SERVICE_ACCOUNT_KEY`: json serialized service account key **(only via environment variables)**
+- `GOOGLE_SERVICE_ACCOUNT_PATH`: an alternative location for the config file **(only via environment variables)**
+
+Supported Google Cloud Storage uri formats are shown below:
+- gs:// \<bucket\> / \<path\>
+
 ## Copy Options
 `pg_parquet` supports the following options in the `COPY TO` command:
 - `format parquet`: you need to specify this option to read or write Parquet files which does not end with `.parquet[.<compression>]` extension,
````
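As a usage illustration of the new `gs://` scheme (hypothetical table and bucket names; assumes the current postgres user has the `parquet_object_store_write` role mentioned in the README):

```bash
# Round-trip a table through a GCS bucket; pg_parquet routes gs:// URIs
# to the new GoogleCloudStorage object store.
psql -c "COPY my_table TO 'gs://testbucket/my_table.parquet';"
psql -c "COPY my_table FROM 'gs://testbucket/my_table.parquet';"
```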

src/object_store.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -9,5 +9,6 @@ use crate::{
 
 pub(crate) mod aws;
 pub(crate) mod azure;
+pub(crate) mod gcs;
 pub(crate) mod local_file;
 pub(crate) mod object_store_cache;
```

src/object_store/gcs.rs

Lines changed: 69 additions & 0 deletions (new file)

```rust
use std::sync::Arc;

use object_store::gcp::GoogleCloudStorageBuilder;
use url::Url;

use super::object_store_cache::ObjectStoreWithExpiration;

// create_gcs_object_store creates a GoogleCloudStorage object store from the given uri.
// It is configured by environment variables. Currently, we only support the
// following environment variables:
// - GOOGLE_SERVICE_ACCOUNT_KEY
// - GOOGLE_SERVICE_ACCOUNT_PATH
pub(crate) fn create_gcs_object_store(uri: &Url) -> ObjectStoreWithExpiration {
    let bucket_name = parse_gcs_bucket(uri).unwrap_or_else(|| {
        panic!("unsupported gcs uri: {}", uri);
    });

    let mut gcs_builder = GoogleCloudStorageBuilder::new().with_bucket_name(bucket_name);

    let gcs_config = GoogleStorageConfig::load();

    // service account key
    if let Some(service_account_key) = gcs_config.service_account_key {
        gcs_builder = gcs_builder.with_service_account_key(&service_account_key);
    }

    // service account path
    if let Some(service_account_path) = gcs_config.service_account_path {
        gcs_builder = gcs_builder.with_service_account_path(&service_account_path);
    }

    let object_store = gcs_builder.build().unwrap_or_else(|e| panic!("{}", e));

    // object store handles refreshing bearer token, so we do not need to handle expiry here
    let expire_at = None;

    ObjectStoreWithExpiration {
        object_store: Arc::new(object_store),
        expire_at,
    }
}

pub(crate) fn parse_gcs_bucket(uri: &Url) -> Option<String> {
    let host = uri.host_str()?;

    // gs://{bucket}/key
    if uri.scheme() == "gs" {
        return Some(host.to_string());
    }

    None
}

// GoogleStorageConfig is a struct that holds the configuration that is
// used to configure the Google Storage object store.
struct GoogleStorageConfig {
    service_account_key: Option<String>,
    service_account_path: Option<String>,
}

impl GoogleStorageConfig {
    // load loads the Google Storage configuration from the environment.
    fn load() -> Self {
        Self {
            service_account_key: std::env::var("GOOGLE_SERVICE_ACCOUNT_KEY").ok(),
            service_account_path: std::env::var("GOOGLE_SERVICE_ACCOUNT_PATH").ok(),
        }
    }
}
```
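For context, the two variables read by `GoogleStorageConfig::load` could be set like this before starting postgres; the values mirror the devcontainer defaults, and only one of the two is needed:

```bash
# Option 1: pass the service account key inline as serialized json.
export GOOGLE_SERVICE_ACCOUNT_KEY='{"gcs_base_url": "http://localhost:4443","disable_oauth": true,"client_email": "","private_key_id": "","private_key": ""}'

# Option 2: point at a key file on disk instead.
export GOOGLE_SERVICE_ACCOUNT_PATH=/tmp/gcs.json
```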

src/object_store/object_store_cache.rs

Lines changed: 10 additions & 6 deletions
```diff
@@ -11,8 +11,10 @@ use pgrx::{ereport, PgLogLevel, PgSqlErrorCode};
 use url::Url;
 
 use super::{
-    aws::parse_s3_bucket, azure::parse_azure_blob_container, create_azure_object_store,
-    create_local_file_object_store, create_s3_object_store,
+    aws::parse_s3_bucket,
+    azure::parse_azure_blob_container,
+    create_azure_object_store, create_local_file_object_store, create_s3_object_store,
+    gcs::{create_gcs_object_store, parse_gcs_bucket},
 };
 
 // OBJECT_STORE_CACHE is a global cache for object stores per Postgres session.
@@ -44,7 +46,7 @@ impl ObjectStoreCache {
     fn get_or_create(&mut self, uri: &Url, copy_from: bool) -> (Arc<dyn ObjectStore>, Path) {
         let (scheme, path) = ObjectStoreScheme::parse(uri).unwrap_or_else(|_| {
             panic!(
-                "unrecognized uri {}. pg_parquet supports local paths, s3:// or azure:// schemes.",
+                "unrecognized uri {}. pg_parquet supports local paths, s3://, azure:// or gs:// schemes.",
                 uri
             )
         });
@@ -74,13 +76,14 @@ impl ObjectStoreCache {
 
 fn create(scheme: ObjectStoreScheme, uri: &Url, copy_from: bool) -> ObjectStoreWithExpiration {
     // object_store crate can recognize a bunch of different schemes and paths, but we only support
-    // local, azure, and s3 schemes with a subset of all supported paths.
+    // local, s3, azure and gs schemes with a subset of all supported paths.
    match scheme {
         ObjectStoreScheme::AmazonS3 => create_s3_object_store(uri),
         ObjectStoreScheme::MicrosoftAzure => create_azure_object_store(uri),
+        ObjectStoreScheme::GoogleCloudStorage => create_gcs_object_store(uri),
         ObjectStoreScheme::Local => create_local_file_object_store(uri, copy_from),
         _ => panic!(
-            "unsupported scheme {} in uri {}. pg_parquet supports local paths, s3:// or azure:// schemes.",
+            "unsupported scheme {} in uri {}. pg_parquet supports local paths, s3://, azure:// or gs:// schemes.",
             uri.scheme(),
             uri
         ),
@@ -131,9 +134,10 @@ impl ObjectStoreCacheKey {
         let bucket = match scheme {
             ObjectStoreScheme::AmazonS3 => parse_s3_bucket(uri).unwrap_or_else(|| panic!("unsupported s3 uri: {uri}")),
             ObjectStoreScheme::MicrosoftAzure => parse_azure_blob_container(uri).unwrap_or_else(|| panic!("unsupported azure blob storage uri: {uri}")),
+            ObjectStoreScheme::GoogleCloudStorage => parse_gcs_bucket(uri).unwrap_or_else(|| panic!("unsupported gs uri: {uri}")),
             ObjectStoreScheme::Local => panic!("local paths should not be cached"),
             _ => panic!(
-                "unsupported scheme {} in uri {}. pg_parquet supports local paths, s3:// or azure:// schemes.",
+                "unsupported scheme {} in uri {}. pg_parquet supports local paths, s3://, azure:// or gs:// schemes.",
                 uri.scheme(),
                 uri
             ),
```
