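"""Fetch data from Filebase/S3 onto local disk.

Downloads everything under advisorai-data/ and merged/, plus only the last
7 numeric folders under merged/archive/. Credentials and the bucket come
from the environment (FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY,
FILEBASE_SECRET_KEY, FILEBASE_BUCKET, loaded via .env); the local base
directory is DATA_DIR if set, else --base-dir, else /data, else /tmp.
"""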
import os
import sys
import argparse

from dotenv import load_dotenv

# Allow imports from the parent directory (the project root) when this file
# is run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.data_cloud.cloud_utils import StorageHandler


def choose_base_dir(cli_base=None):
    """Choose a writable base directory. Preference order:

    1. CLI-provided path
    2. /data (persistent volume on Spaces)
    3. /tmp
    """
    candidates = []
    if cli_base:
        candidates.append(cli_base)
    candidates.extend(['/data', '/tmp'])

    for base in candidates:
        try:
            merged_path = os.path.abspath(os.path.join(base, 'merged'))
            advisorai_path = os.path.abspath(os.path.join(base, 'advisorai-data'))
            os.makedirs(merged_path, mode=0o777, exist_ok=True)
            os.makedirs(advisorai_path, mode=0o777, exist_ok=True)

            # makedirs succeeding does not prove the volume is writable
            # (e.g. a read-only mount), so write and remove a probe file.
            test_file = os.path.join(merged_path, '.write_test')
            with open(test_file, 'w') as f:
                f.write('ok')
            os.remove(test_file)
            return base
        except Exception:
            continue

    return '/tmp'


def main(argv=None):
    parser = argparse.ArgumentParser(description='Fetch data from Filebase/S3 into local disk')
    parser.add_argument('--base-dir', help='Base directory to store data (default: auto-detected)')
    args = parser.parse_args(argv)

    load_dotenv()

    endpoint_url = os.getenv('FILEBASE_ENDPOINT', 'https://s3.filebase.com')
    access_key = os.getenv('FILEBASE_ACCESS_KEY')
    secret_key = os.getenv('FILEBASE_SECRET_KEY')
    bucket_name = os.getenv('FILEBASE_BUCKET')

    # DATA_DIR, when set, overrides both the --base-dir flag and auto-detection.
    env_base = os.getenv('DATA_DIR')
    if env_base:
        base_root = env_base
    else:
        base_root = choose_base_dir(args.base_dir)
    local_base = os.path.abspath(os.path.join(base_root, 'merged'))
    advisorai_base = os.path.abspath(os.path.join(base_root, 'advisorai-data'))

    os.makedirs(local_base, mode=0o777, exist_ok=True)
    os.makedirs(advisorai_base, mode=0o777, exist_ok=True)

    storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name, local_base=local_base)

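    # Three fetch phases follow: all of advisorai-data/, all of merged/
    # except its archive, then only the newest merged/archive/ folders.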
    advisor_prefix = "advisorai-data/"
    print(f"Fetching all folders/files from: {advisor_prefix}")
    advisor_keys = []
    if storage.s3 and bucket_name:
        try:
            # Paginate: boto3 returns at most 1000 keys per list_objects_v2 call.
            paginator = storage.s3.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name, Prefix=advisor_prefix):
                for obj in page.get('Contents', []):
                    key = obj['Key']
                    if not key.endswith('/'):  # skip directory placeholder keys
                        advisor_keys.append(key)
        except Exception as e:
            print(f"[WARN] Could not list objects for {advisor_prefix}: {e}")
    else:
        print("[ERROR] No S3 client or bucket configured for advisorai-data!")

    for key in advisor_keys:
        try:
            data = storage.download(key)
            # Mirror the bucket layout locally, minus the prefix.
            local_rel_path = key[len(advisor_prefix):] if key.startswith(advisor_prefix) else key
            local_path = os.path.join(advisorai_base, local_rel_path)
            os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True)
            with open(local_path, 'wb') as f:
                f.write(data)
            print(f"[OK] Downloaded advisorai-data/{local_rel_path} from s3://{bucket_name}/{key}")
        except Exception as e:
            print(f"[ERROR] Failed to fetch advisorai-data file {key}: {e}")

    merged_prefix = "merged/"
    archive_prefix = "merged/archive/"
    print(f"Fetching everything under: {merged_prefix} (archive limited to its last 7 folders)")
    merged_keys = []
    if storage.s3 and bucket_name:
        try:
            paginator = storage.s3.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name, Prefix=merged_prefix):
                for obj in page.get('Contents', []):
                    key = obj['Key']
                    if key.startswith(archive_prefix):
                        continue  # archive keys are fetched separately below
                    if not key.endswith('/'):
                        merged_keys.append(key)
        except Exception as e:
            print(f"[WARN] Could not list objects for {merged_prefix}: {e}")
    else:
        print("[ERROR] No S3 client or bucket configured for merged!")

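    # Each merged/ key is written under local_base with the prefix stripped,
    # so the local tree mirrors the bucket.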
    for key in merged_keys:
        try:
            data = storage.download(key)
            local_rel_path = key[len(merged_prefix):] if key.startswith(merged_prefix) else key
            local_path = os.path.join(local_base, local_rel_path)
            os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True)
            with open(local_path, 'wb') as f:
                f.write(data)
            print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}")
        except Exception as e:
            print(f"[ERROR] Failed to fetch {key}: {e}")

    print(f"Fetching last 7 archive folders from: {archive_prefix}")
    archive_folders = set()
    archive_objects = []
    archive_keys = []
    if storage.s3 and bucket_name:
        try:
            # Single listing pass: remember each key together with its
            # top-level folder name so we can filter after sorting.
            paginator = storage.s3.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name, Prefix=archive_prefix):
                for obj in page.get('Contents', []):
                    key = obj['Key']
                    parts = key[len(archive_prefix):].split('/')
                    if len(parts) > 1 and parts[0].isdigit():
                        archive_folders.add(parts[0])
                        archive_objects.append((parts[0], key))

            # Sort numerically: a plain string sort would rank '9' above '10'.
            last7 = set(sorted(archive_folders, key=int)[-7:])
            print(f"[INFO] Last 7 archive folders: {sorted(last7, key=int)}")
            archive_keys = [key for folder, key in archive_objects if folder in last7]
        except Exception as e:
            print(f"[WARN] Could not list objects for {archive_prefix}: {e}")
    else:
        print("[ERROR] No S3 client or bucket configured for archive!")

    for key in archive_keys:
        try:
            data = storage.download(key)
            local_rel_path = key[len(merged_prefix):] if key.startswith(merged_prefix) else key
            local_path = os.path.join(local_base, local_rel_path)
            os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True)
            with open(local_path, 'wb') as f:
                f.write(data)
            print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}")
        except Exception as e:
            print(f"[ERROR] Failed to fetch archive file {key}: {e}")


if __name__ == "__main__":
    main()