adding exception counter

This commit is contained in:
michael-corey 2026-04-20 17:02:35 -05:00
parent e48038f3c6
commit 011d8418a6

View File

@ -6,7 +6,9 @@ configured by the ``S3_BUCKET`` constant), then for each prefix:
- **Only considers files matching the ``FILE_EXTENSION`` filter** (default
``.sas7bdat``). All other file types are ignored.
- Tests read permission with ``head_object`` on the first matching file found
- Categorises the directory as **Available**, **Blocked**, or **Empty**
- If the first file is accessible, tests ALL remaining files individually
- Categorises the directory as **Available**, **Blocked**, or **Empty**, and
tracks individual file **Exceptions** (files that fail the check) within available directories
A directory is considered *empty* if it contains no files matching the
extension filter, even when other file types are present.
@ -69,6 +71,9 @@ class AvailableDir:
prefix: str
file_count: int
total_size: int # bytes
accessible_count: int = 0 # files that passed head_object
total_count: int = 0 # total .sas7bdat files found
accessible_size: int = 0 # total size of accessible files only
@dataclass
@ -87,6 +92,15 @@ class EmptyDir:
prefix: str
@dataclass
class ExceptionFile:
    """A specific file that failed permission check within an otherwise available directory.

    One instance is recorded per failing key and collected in
    ``Results.exceptions``.
    """

    prefix: str  # the directory prefix the file was listed under
    key: str  # the full S3 key of the failed file
    error: str  # the error message returned by the failed permission check
@dataclass
class Results:
"""Aggregated exploration results."""
@ -94,6 +108,7 @@ class Results:
available: List[AvailableDir] = field(default_factory=list)
blocked: List[BlockedDir] = field(default_factory=list)
empty: List[EmptyDir] = field(default_factory=list)
exceptions: List[ExceptionFile] = field(default_factory=list)
# ---------------------------------------------------------------------------
@ -138,34 +153,27 @@ def list_objects(
s3_client: "botocore.client.S3",
bucket: str,
prefix: str,
) -> Tuple[str | None, int, int]:
"""Recursively list all objects under *prefix* using streaming counters.
) -> Tuple[List[Tuple[str, int]], int]:
"""Recursively list all objects under *prefix*.
Only objects whose key ends with ``FILE_EXTENSION`` (case-insensitive) are
counted. All other files are silently skipped.
Returns ``(first_key, file_count, total_size)`` where *first_key* is the
key of the first matching object found (or ``None`` if no matching files
exist), *file_count* is the total number of matching objects, and
*total_size* is the sum of their sizes in bytes.
Unlike the previous implementation this never accumulates all keys in
memory, making it safe for prefixes with millions of objects.
Returns ``(files, total_size)`` where *files* is a list of
``(key, size)`` tuples for every matching object and *total_size* is the
sum of their sizes in bytes.
"""
ext_lower = FILE_EXTENSION.lower()
paginator = s3_client.get_paginator("list_objects_v2")
first_key: str | None = None
file_count: int = 0
files: List[Tuple[str, int]] = []
total_size: int = 0
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
for obj in page.get("Contents", []):
if not obj["Key"].lower().endswith(ext_lower):
continue
if first_key is None:
first_key = obj["Key"]
file_count += 1
files.append((obj["Key"], obj["Size"]))
total_size += obj["Size"]
return first_key, file_count, total_size
return files, total_size
def check_read_permission(
@ -204,7 +212,7 @@ def explore_directories(prefixes: List[str]) -> Results:
# --- Recursive listing ------------------------------------------------
try:
first_key, file_count, total_size = list_objects(s3, S3_BUCKET, prefix)
files, total_size = list_objects(s3, S3_BUCKET, prefix)
except botocore.exceptions.ClientError as exc:
code = exc.response.get("Error", {}).get("Code", "Unknown")
message = exc.response.get("Error", {}).get("Message", str(exc))
@ -218,45 +226,77 @@ def explore_directories(prefixes: List[str]) -> Results:
)
continue
if first_key is None:
if not files:
results.empty.append(EmptyDir(prefix=prefix))
continue
# --- Permission check -------------------------------------------------
file_count = len(files)
# --- Permission check on first file -----------------------------------
# Prefer a real object over a zero-byte directory marker (key ending
# in "/") for the head_object test. The selected key must also match
# the FILE_EXTENSION filter. If no suitable key is found, fall back
# to first_key.
ext_lower = FILE_EXTENSION.lower()
# in "/") for the head_object test.
first_key, _ = files[0]
test_key = first_key
if first_key.endswith("/") and total_size > 0:
# Re-scan the first page to find a non-marker key matching the extension
try:
probe_paginator = s3.get_paginator("list_objects_v2")
for probe_page in probe_paginator.paginate(
Bucket=S3_BUCKET, Prefix=prefix, PaginationConfig={"MaxItems": 1000}
):
for obj in probe_page.get("Contents", []):
if (
not (obj["Key"].endswith("/") and obj["Size"] == 0)
and obj["Key"].lower().endswith(ext_lower)
):
test_key = obj["Key"]
for key, size in files:
if not (key.endswith("/") and size == 0):
test_key = key
break
if test_key != first_key:
break
except Exception:
pass # Fall back to first_key
error = check_read_permission(s3, S3_BUCKET, test_key)
if error is None:
results.available.append(
AvailableDir(prefix=prefix, file_count=file_count, total_size=total_size)
)
else:
if error is not None:
# First file blocked → entire directory is blocked
results.blocked.append(
BlockedDir(prefix=prefix, file_count=file_count, error=error)
)
continue
# --- First file accessible → check ALL remaining files ----------------
accessible_count = 1 # the first (test_key) already passed
accessible_size = 0
dir_exceptions: List[ExceptionFile] = []
# Find the size of the test_key to count it
for key, size in files:
if key == test_key:
accessible_size = size
break
# Build list of remaining files to check
remaining = [(key, size) for key, size in files if key != test_key]
if remaining:
if len(remaining) > 10:
print(
f" Verifying access to {file_count} {FILE_EXTENSION} files in {prefix} ...",
file=sys.stderr,
)
for key, size in remaining:
file_error = check_read_permission(s3, S3_BUCKET, key)
if file_error is None:
accessible_count += 1
accessible_size += size
else:
dir_exceptions.append(
ExceptionFile(prefix=prefix, key=key, error=file_error)
)
else:
# Only one file and it passed
accessible_size = total_size
results.available.append(
AvailableDir(
prefix=prefix,
file_count=file_count,
total_size=total_size,
accessible_count=accessible_count,
total_count=file_count,
accessible_size=accessible_size,
)
)
results.exceptions.extend(dir_exceptions)
return results
@ -278,7 +318,10 @@ def print_results(results: Results) -> None:
if results.available:
for d in results.available:
print(f" {d.prefix}")
print(f" {FILE_EXTENSION} files: {d.file_count} | Total Size: {format_size(d.total_size)}")
print(
f" {FILE_EXTENSION} files: {d.accessible_count}/{d.total_count} accessible"
f" | Total Size: {format_size(d.accessible_size)}"
)
else:
print(" (none)")
@ -296,6 +339,16 @@ def print_results(results: Results) -> None:
else:
print(" (none)")
# --- Exceptions ---
print()
print(f"--- Exceptions ({len(results.exceptions)}) ---")
if results.exceptions:
for exc in results.exceptions:
print(f" {exc.key}")
print(f" Directory: {exc.prefix} | Error: {exc.error}")
else:
print(" (none)")
# --- Empty ---
print()
print(f"--- Empty / no {FILE_EXTENSION} files ({len(results.empty)}) ---")