From e48038f3c65471aff1074e0f37b7df8937e305b1 Mon Sep 17 00:00:00 2001 From: michael-corey Date: Mon, 20 Apr 2026 16:30:35 -0500 Subject: [PATCH] updating for sas --- utils/data_explorer.py | 48 ++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/utils/data_explorer.py b/utils/data_explorer.py index 6f3cc3b..617a400 100644 --- a/utils/data_explorer.py +++ b/utils/data_explorer.py @@ -3,10 +3,15 @@ Reads a text file containing one S3 prefix per line (paths within the bucket configured by the ``S3_BUCKET`` constant), then for each prefix: - Lists all objects recursively (via ``list_objects_v2`` paginator) -- Tests read permission with ``head_object`` on the first file found +- **Only considers files matching the ``FILE_EXTENSION`` filter** (default + ``.sas7bdat``). All other file types are ignored. +- Tests read permission with ``head_object`` on the first matching file found - Categorises the directory as **Available**, **Blocked**, or **Empty** -Configure the three constants below, then run:: +A directory is considered *empty* if it contains no files matching the +extension filter, even when other file types are present. + +Configure the constants below, then run:: python3 data_explorer.py @@ -39,6 +44,9 @@ except ImportError: # Configuration — edit these before running # --------------------------------------------------------------------------- +FILE_EXTENSION: str = ".sas7bdat" +"""Only files whose key ends with this extension (case-insensitive) are considered.""" + INPUT_FILE: str = "s3_directories.txt" """Path to the text file containing one S3 prefix per line.""" @@ -133,20 +141,26 @@ def list_objects( ) -> Tuple[str | None, int, int]: """Recursively list all objects under *prefix* using streaming counters. + Only objects whose key ends with ``FILE_EXTENSION`` (case-insensitive) are + counted. All other files are silently skipped. + Returns ``(first_key, file_count, total_size)`` where *first_key* is the - key of the first object found (or ``None`` if the prefix is empty), - *file_count* is the total number of objects, and *total_size* is the sum - of all object sizes in bytes. + key of the first matching object found (or ``None`` if no matching files + exist), *file_count* is the total number of matching objects, and + *total_size* is the sum of their sizes in bytes. Unlike the previous implementation this never accumulates all keys in memory, making it safe for prefixes with millions of objects. """ + ext_lower = FILE_EXTENSION.lower() paginator = s3_client.get_paginator("list_objects_v2") first_key: str | None = None file_count: int = 0 total_size: int = 0 for page in paginator.paginate(Bucket=bucket, Prefix=prefix): for obj in page.get("Contents", []): + if not obj["Key"].lower().endswith(ext_lower): + continue if first_key is None: first_key = obj["Key"] file_count += 1 @@ -183,7 +197,10 @@ def explore_directories(prefixes: List[str]) -> Results: total = len(prefixes) for idx, prefix in enumerate(prefixes, start=1): - print(f"[{idx}/{total}] Checking {prefix} ...", file=sys.stderr) + print( + f"[{idx}/{total}] Checking {prefix} (filtering for {FILE_EXTENSION}) ...", + file=sys.stderr, + ) # --- Recursive listing ------------------------------------------------ try: @@ -207,18 +224,23 @@ def explore_directories(prefixes: List[str]) -> Results: # --- Permission check ------------------------------------------------- # Prefer a real object over a zero-byte directory marker (key ending - # in "/") for the head_object test. If every key is a directory - # marker, fall back to the first one anyway. + # in "/") for the head_object test. The selected key must also match + # the FILE_EXTENSION filter. If no suitable key is found, fall back + # to first_key. + ext_lower = FILE_EXTENSION.lower() test_key = first_key if first_key.endswith("/") and total_size > 0: - # Re-scan the first page to find a non-marker key + # Re-scan the first page to find a non-marker key matching the extension try: probe_paginator = s3.get_paginator("list_objects_v2") for probe_page in probe_paginator.paginate( Bucket=S3_BUCKET, Prefix=prefix, PaginationConfig={"MaxItems": 1000} ): for obj in probe_page.get("Contents", []): - if not (obj["Key"].endswith("/") and obj["Size"] == 0): + if ( + not (obj["Key"].endswith("/") and obj["Size"] == 0) + and obj["Key"].lower().endswith(ext_lower) + ): test_key = obj["Key"] break if test_key != first_key: @@ -256,7 +278,7 @@ def print_results(results: Results) -> None: if results.available: for d in results.available: print(f" {d.prefix}") - print(f" Files: {d.file_count} | Total Size: {format_size(d.total_size)}") + print(f" {FILE_EXTENSION} files: {d.file_count} | Total Size: {format_size(d.total_size)}") else: print(" (none)") @@ -267,7 +289,7 @@ def print_results(results: Results) -> None: for d in results.blocked: if d.file_count: print(f" {d.prefix}") - print(f" Files found: {d.file_count} | Error: {d.error}") + print(f" {FILE_EXTENSION} files found: {d.file_count} | Error: {d.error}") else: print(f" {d.prefix}") print(f" Error: {d.error}") @@ -276,7 +298,7 @@ def print_results(results: Results) -> None: # --- Empty --- print() - print(f"--- Empty ({len(results.empty)}) ---") + print(f"--- Empty / no {FILE_EXTENSION} files ({len(results.empty)}) ---") if results.empty: for d in results.empty: print(f" {d.prefix}")