This repository has been archived by the owner on Nov 6, 2023. It is now read-only.
feat(s3): use bucket name for data_source_oddrn (#45)
Showing 8 changed files with 494 additions and 509 deletions.
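The commit reworks the S3 adapter so that the data_source_oddrn is derived from the configured bucket name. As a rough illustration of the idea only (the real oddrn is built by oddrn-generator's S3Generator, so the path layout and the helper below are assumptions, not the library's output):

    # Illustrative sketch only: a data source oddrn keyed by the bucket name.
    # The actual oddrn is produced by oddrn-generator's S3Generator; this layout is an assumption.
    def data_source_oddrn_for_bucket(bucket: str) -> str:
        return f"//s3/cloud/aws/buckets/{bucket}"


    print(data_source_oddrn_for_bucket("my_bucket"))  # //s3/cloud/aws/buckets/my_bucket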
@@ -1,11 +1,19 @@
default_pulling_interval: 10
token:
# Description: Configuration file for the collector. Find more examples in config_examples folder.

default_pulling_interval: 60
token: <token>
platform_host_url: "http://localhost:8080"
plugins:
  - type: s3
    name: s3_adapter
    aws_access_key_id:
    aws_secret_access_key:
    datasets:
      - bucket: my_bucket
        prefix: prefix
    # aws_access_key_id: <aws_access_key_id>
    # aws_secret_access_key: <aws_secret_access_key>
    # aws_region: <aws_region>
    # aws_session_token: <aws_session_token>
    # aws_role_arn: <aws_role_arn>
    # aws_role_session_name: <aws_role_session_name>
    # profile_name: <profile_name>
    # endpoint_url: <endpoint_url>
    dataset_config:
      - bucket: <bucket_name>
        prefix: <optional_prefix>
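The new example replaces the top-level datasets list with a dataset_config block pointing at a single bucket and an optional prefix. A rough, hypothetical sketch of how that shape could be modeled with pydantic; the field names simply mirror the YAML keys above, and this is not the repository's actual S3Plugin definition:

    # Hypothetical model mirroring the YAML keys above; not the repository's actual S3Plugin.
    from typing import Optional

    from pydantic import BaseModel


    class DatasetConfig(BaseModel):
        bucket: str
        prefix: str = ""  # optional; an empty prefix walks the whole bucket


    class S3PluginSketch(BaseModel):
        type: str = "s3"
        name: str
        aws_access_key_id: Optional[str] = None
        aws_secret_access_key: Optional[str] = None
        aws_region: Optional[str] = None
        endpoint_url: Optional[str] = None  # only for S3-compatible storage such as Minio
        dataset_config: DatasetConfig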
@@ -1,53 +1,49 @@
# S3 collector-config.yaml example
# Note: The following example is for AWS S3. For S3 compatible storage, see the example below.
# All AWS S3 parameters are optional according to default behavior of boto3.
# If not provided, boto3 will search for credentials in environment variables, ~/.aws/credentials and ~/.aws/config

platform_host_url: http://localhost:8080
default_pulling_interval: 10 # Can be omitted to run collector once
default_pulling_interval: 60 # Pulling interval in minutes. Can be omitted to run collector once
token: "" # Token that must be retrieved from the platform
plugins:
  - type: s3
    name: s3_adapter
    aws_secret_access_key: <aws_secret_access_key> # Optional.
    aws_access_key_id: <aws_access_key_id> # Optional.
    aws_session_token: <aws_session_token> # Optional.
    aws_session_token: <aws_session_token> # Optional. Required if using temporary credentials.
    aws_region: <aws_region> # Optional.
    aws_role_arn: <aws_role_arn> # Optional. Required for assuming role with temporary credentials.
    aws_role_session_name: <aws_role_session_name> # Optional. Required for assuming role with temporary credentials.
    profile_name: <profile_name> # Optional.
    filename_filter: # Optional. Default filter allows each file to be ingested to platform.
      include: [ '.*.parquet' ]
      exclude: [ 'dev_.*' ]
    datasets:
      # Recursive fetch for all objects in the bucket.
      - bucket: my_bucket
      # Explicitly specify the prefix to file.
      - bucket: my_bucket
        prefix: folder/subfolder/file.csv
      # When we want to use the folder as a dataset. Very useful for partitioned datasets.
      # I.e it can be Hive partitioned dataset with structure like this:
      # s3://my_bucket/partitioned_data/year=2019/month=01/...
      - bucket: my_bucket
        prefix: partitioned_data/
        folder_as_dataset:
          file_format: parquet
          flavor: hive

      #field_names must be provided if partition flavor was not used. I.e for structure like this:
      # s3://my_bucket/partitioned_data/year/...
      - bucket: my_bucket
        prefix: partitioned_data/
        folder_as_dataset:
          file_format: csv
          field_names: ['year']

# S3 compatible collector-config.yaml example, for example for Minio we need to specify endpoint_url
platform_host_url: "http://localhost:8080"
default_pulling_interval: 10
token: ""
plugins:
    dataset_config:
      bucket: my_bucket
      prefix: folder/subfolder/file.csv # Optional. Default is empty string.
    # When we want to use the folder as a dataset. Very useful for partitioned datasets.
  - type: s3
    name: s3_partitioned_adapter
    aws_secret_access_key: <aws_secret_access_key> # Optional.
    aws_access_key_id: <aws_access_key_id> # Optional.
    aws_session_token: <aws_session_token> # Optional. Required if using temporary credentials.
    aws_region: <aws_region> # Optional.
    aws_role_arn: <aws_role_arn> # Optional. Required for assuming role with temporary credentials.
    aws_role_session_name: <aws_role_session_name> # Optional. Required for assuming role with temporary credentials.
    profile_name: <profile_name> # Optional.
    dataset_config:
      bucket: my_bucket
      prefix: partitioned_data/
      folder_as_dataset:
        file_format: parquet # Format of the files in the folder. Can be parquet, csv, tsv.
        flavor: hive # Optional. Default is hive. Can be hive or presto.
        field_names: ['year', 'month'] # Optional. Must be provided if flavor is other than hive. I.e. structure s3://my_bucket/partitioned_data/year/...
    # When S3 storage is compatible with AWS S3 API, for example Minio.
  - type: s3
    name: s3_minio_adapter
    endpoint_url: http://localhost:9000
    aws_secret_access_key: minioadmin
    aws_access_key_id: minioadmin
    datasets:
      - bucket: my_bucket
        prefix: partitioned_data
    dataset_config:
      bucket: my_bucket
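The folder_as_dataset options above hinge on how partition folders are named. A small illustrative sketch, not code from this repository, of the difference between the hive flavor, where each folder level carries its own field name, and a plain layout that needs explicit field_names:

    # Illustrative only; not code from this repository.
    # Hive-flavored keys embed field names:     partitioned_data/year=2019/month=01/data.parquet
    # Plain layouts need explicit field_names:  partitioned_data/2019/01/data.csv
    def parse_hive_partitions(key: str) -> dict:
        """Pull partition fields out of a hive-style object key."""
        fields = {}
        for segment in key.split("/"):
            if "=" in segment:
                name, value = segment.split("=", 1)
                fields[name] = value
        return fields


    def parse_named_partitions(key: str, field_names: list) -> dict:
        """Zip explicit field_names onto folder levels when the layout does not encode them."""
        levels = key.split("/")[1:-1]  # drop the dataset prefix and the file name
        return dict(zip(field_names, levels))


    print(parse_hive_partitions("partitioned_data/year=2019/month=01/data.parquet"))
    # {'year': '2019', 'month': '01'}
    print(parse_named_partitions("partitioned_data/2019/01/data.csv", ["year", "month"]))
    # {'year': '2019', 'month': '01'}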
@@ -1,44 +1,32 @@
import traceback as tb
from typing import Iterable, Union

from odd_collector_sdk.domain.adapter import AbstractAdapter
from odd_collector_sdk.domain.adapter import BaseAdapter
from odd_models.models import DataEntityList
from oddrn_generator.generators import Generator, S3Generator

from odd_collector_aws.domain.plugin import S3Plugin
from odd_collector_aws.utils.create_generator import create_generator

from .file_system import FileSystem
from .logger import logger
from .mapper.bucket import map_bucket


class Adapter(AbstractAdapter):
class Adapter(BaseAdapter):
    config: S3Plugin
    generator: Union[Generator, S3Generator]

    def __init__(self, config: S3Plugin) -> None:
        self.config = config
        self.generator = create_generator(S3Generator, config)
        super().__init__(config)
        self.fs = FileSystem(config)

    def get_data_source_oddrn(self) -> str:
        return self.generator.get_data_source_oddrn()
    def create_generator(self) -> Generator:
        return create_generator(S3Generator, self.config)

    def get_data_entity_list(self) -> Iterable[DataEntityList]:
        for dataset_config in self.config.datasets:
            try:
                bucket = self.fs.get_bucket(dataset_config)
                data_entities = map_bucket(bucket, self.generator)

                yield DataEntityList(
                    data_source_oddrn=self.get_data_source_oddrn(),
                    items=list(data_entities),
                )
            except Exception as e:
                logger.error(
                    f"Error while processing bucket {dataset_config.bucket}: {e}."
                    " SKIPPING."
                )
                logger.debug(tb.format_exc())
                continue
        bucket = self.fs.get_bucket(self.config.dataset_config)
        data_entities = map_bucket(bucket, self.generator)

        yield DataEntityList(
            data_source_oddrn=self.get_data_source_oddrn(),
            items=list(data_entities),
        )
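With the switch to BaseAdapter, the adapter reads a single dataset_config and yields one DataEntityList per run. A minimal driving sketch under assumptions: config is an already-parsed S3Plugin instance and adapter_cls is the Adapter class shown above; this is hypothetical wiring, not repository code:

    # Minimal sketch; `config` is assumed to be a parsed S3Plugin instance and
    # `adapter_cls` the Adapter class from the diff above. Hypothetical, not repository code.
    def ingest(config, adapter_cls) -> None:
        adapter = adapter_cls(config)
        # The oddrn now comes from the generator built by create_generator() for this config.
        print(adapter.get_data_source_oddrn())
        for entity_list in adapter.get_data_entity_list():
            # Each DataEntityList bundles the mapped bucket entities for the platform.
            print(len(entity_list.items))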
@@ -1 +1,3 @@
from odd_collector_sdk.logger import logger

logger = logger