From e48038f3c65471aff1074e0f37b7df8937e305b1 Mon Sep 17 00:00:00 2001
From: michael-corey <mcorey@americafirstpolicy.com>
Date: Mon, 20 Apr 2026 16:30:35 -0500
Subject: [PATCH] data_explorer: only consider FILE_EXTENSION (.sas7bdat) files

---
 utils/data_explorer.py | 48 ++++++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/utils/data_explorer.py b/utils/data_explorer.py
index 6f3cc3b..617a400 100644
--- a/utils/data_explorer.py
+++ b/utils/data_explorer.py
@@ -3,10 +3,15 @@
 Reads a text file containing one S3 prefix per line (paths within the bucket
 configured by the ``S3_BUCKET`` constant), then for each prefix:
 - Lists all objects recursively (via ``list_objects_v2`` paginator)
-- Tests read permission with ``head_object`` on the first file found
+- **Only considers files matching the ``FILE_EXTENSION`` filter** (default
+  ``.sas7bdat``).  All other file types are ignored.
+- Tests read permission with ``head_object`` on the first matching file found
 - Categorises the directory as **Available**, **Blocked**, or **Empty**
 
-Configure the three constants below, then run::
+A directory is considered *empty* if it contains no files matching the
+extension filter, even when other file types are present.
+
+Configure the constants below, then run::
 
     python3 data_explorer.py
 
@@ -39,6 +44,9 @@ except ImportError:
 # Configuration — edit these before running
 # ---------------------------------------------------------------------------
 
+FILE_EXTENSION: str = ".sas7bdat"
+"""Only files whose key ends with this extension (case-insensitive) are considered."""
+
 INPUT_FILE: str = "s3_directories.txt"
 """Path to the text file containing one S3 prefix per line."""
 
@@ -133,20 +141,26 @@ def list_objects(
 ) -> Tuple[str | None, int, int]:
     """Recursively list all objects under *prefix* using streaming counters.
 
+    Only objects whose key ends with ``FILE_EXTENSION`` (case-insensitive) are
+    counted.  All other files are silently skipped.
+
     Returns ``(first_key, file_count, total_size)`` where *first_key* is the
-    key of the first object found (or ``None`` if the prefix is empty),
-    *file_count* is the total number of objects, and *total_size* is the sum
-    of all object sizes in bytes.
+    key of the first matching object found (or ``None`` if no matching files
+    exist), *file_count* is the total number of matching objects, and
+    *total_size* is the sum of their sizes in bytes.
 
     Unlike the previous implementation this never accumulates all keys in
     memory, making it safe for prefixes with millions of objects.
     """
+    ext_lower = FILE_EXTENSION.lower()
     paginator = s3_client.get_paginator("list_objects_v2")
     first_key: str | None = None
     file_count: int = 0
     total_size: int = 0
     for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
         for obj in page.get("Contents", []):
+            if not obj["Key"].lower().endswith(ext_lower):
+                continue
             if first_key is None:
                 first_key = obj["Key"]
             file_count += 1
@@ -183,7 +197,10 @@ def explore_directories(prefixes: List[str]) -> Results:
     total = len(prefixes)
 
     for idx, prefix in enumerate(prefixes, start=1):
-        print(f"[{idx}/{total}] Checking {prefix} ...", file=sys.stderr)
+        print(
+            f"[{idx}/{total}] Checking {prefix} (filtering for {FILE_EXTENSION}) ...",
+            file=sys.stderr,
+        )
 
         # --- Recursive listing ------------------------------------------------
         try:
@@ -207,18 +224,23 @@ def explore_directories(prefixes: List[str]) -> Results:
 
         # --- Permission check -------------------------------------------------
         # Prefer a real object over a zero-byte directory marker (key ending
-        # in "/") for the head_object test.  If every key is a directory
-        # marker, fall back to the first one anyway.
+        # in "/") for the head_object test; the probe only accepts keys that
+        # match FILE_EXTENSION, else test_key stays first_key.  NOTE(review):
+        # first_key is presumably already filtered, so this may be dead code.
+        ext_lower = FILE_EXTENSION.lower()
         test_key = first_key
         if first_key.endswith("/") and total_size > 0:
-            # Re-scan the first page to find a non-marker key
+            # Re-scan the first page to find a non-marker key matching the extension
             try:
                 probe_paginator = s3.get_paginator("list_objects_v2")
                 for probe_page in probe_paginator.paginate(
                     Bucket=S3_BUCKET, Prefix=prefix, PaginationConfig={"MaxItems": 1000}
                 ):
                     for obj in probe_page.get("Contents", []):
-                        if not (obj["Key"].endswith("/") and obj["Size"] == 0):
+                        if (
+                            not (obj["Key"].endswith("/") and obj["Size"] == 0)
+                            and obj["Key"].lower().endswith(ext_lower)
+                        ):
                             test_key = obj["Key"]
                             break
                     if test_key != first_key:
@@ -256,7 +278,7 @@ def print_results(results: Results) -> None:
     if results.available:
         for d in results.available:
             print(f"  {d.prefix}")
-            print(f"    Files: {d.file_count} | Total Size: {format_size(d.total_size)}")
+            print(f"    {FILE_EXTENSION} files: {d.file_count} | Total Size: {format_size(d.total_size)}")
     else:
         print("  (none)")
 
@@ -267,7 +289,7 @@ def print_results(results: Results) -> None:
         for d in results.blocked:
             if d.file_count:
                 print(f"  {d.prefix}")
-                print(f"    Files found: {d.file_count} | Error: {d.error}")
+                print(f"    {FILE_EXTENSION} files found: {d.file_count} | Error: {d.error}")
             else:
                 print(f"  {d.prefix}")
                 print(f"    Error: {d.error}")
@@ -276,7 +298,7 @@ def print_results(results: Results) -> None:
 
     # --- Empty ---
     print()
-    print(f"--- Empty ({len(results.empty)}) ---")
+    print(f"--- Empty / no {FILE_EXTENSION} files ({len(results.empty)}) ---")
     if results.empty:
         for d in results.empty:
             print(f"  {d.prefix}")