#!/usr/bin/env bash
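#
# Report every g005/g006 directory (matched case-insensitively), and the files
# it contains, under each top-level subdirectory of a given directory.
#
# Usage: <this-script> [TOP_DIR]      (TOP_DIR defaults to the current directory)
#
# Handles date symlinks (e.g. 20260225 -> SO_ALL_LZ_...), deduplicates folders
# that are reachable under more than one name, honors the EXCLUDE_SUBDIRS
# config list, and prints summary stats at the end.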

set -u

# ==================== CONFIG ====================
# Names of entries directly under TOP_DIR that are never scanned.
EXCLUDE_SUBDIRS=(
    ".AppleDouble"
    "TestData"
    "byapid"
    "danfiles"
    "2002"
)
# ==================== END CONFIG ====================

# Run-wide state; main() resets all of it at the start of every scan.
declare -i total_top=0        # distinct top-level folders processed
declare -i nonempty_g005=0    # g005 directories holding at least one file
declare -i nonempty_g006=0    # g006 directories holding at least one file
declare -A seen_realpath=()   # resolved path -> 1, used for deduplication

#######################################
# Print the size of one file in human-readable form.
# Arguments: $1 - path of the file
# Outputs:   numfmt's IEC rendering of the byte count, or "<bytes> B" when
#            numfmt is missing or fails. The byte count is 0 if stat fails.
#######################################
human_size() {
    local path=$1
    local bytes

    # Try GNU stat syntax first, then BSD stat syntax.
    bytes=$(stat -c %s "$path" 2>/dev/null || stat -f %z "$path" 2>/dev/null || echo 0)
    numfmt --to=iec-i --format="%.1f" "$bytes" 2>/dev/null || printf "%s B" "$bytes"
}

#######################################
# Report one g005/g006 directory: its location, file count and file list.
# Globals:   nonempty_g005, nonempty_g006 (incremented when files are found)
# Arguments: $1 - top-level entry being scanned (as listed under TOP_DIR)
#            $2 - pattern that matched, "g005" or "g006"
#            $3 - path of the matched directory
# Outputs:   report lines on stdout
#######################################
report_gdir() {
    local link=$1 pat=$2 gdir=$3
    local rel_gdir relpath
    local -a files=()
    local -i count=0

    rel_gdir=$(realpath --relative-to="$link" "$gdir" 2>/dev/null || echo "$gdir")
    printf "      at: ./%s\n" "$rel_gdir"

    # -L follows symlinks inside the tree; each file is reported relative to gdir.
    mapfile -t files < <(
        find -L "$gdir" -type f -print0 2>/dev/null \
            | sort -z \
            | xargs -0 -I{} realpath --relative-to="$gdir" "{}" 2>/dev/null
    )
    count=${#files[@]}
    printf "         -- %d files\n" "$count"

    # Returning early also avoids expanding an empty array, which is an
    # "unbound variable" error under `set -u` in bash older than 4.4.
    if (( count == 0 )); then
        return 0
    fi

    case "$pat" in
        g005) (( nonempty_g005++ )) ;;
        g006) (( nonempty_g006++ )) ;;
    esac

    for relpath in "${files[@]}"; do
        printf "            -- %s   (%s)\n" "$relpath" "$(human_size "$gdir/$relpath")"
    done
    return 0
}

#######################################
# Scan one top-level entry for g005 and g006 directories (any letter case).
# Arguments: $1 - top-level entry as listed under TOP_DIR (may be a symlink)
#            $2 - the same entry with symlinks resolved
# Outputs:   report lines on stdout, terminated by a separator line
#######################################
scan_subdir() {
    local link=$1 real=$2
    local pat gdir
    local found_any=false
    local -a gdirs=()

    printf "       %s  (real path: %s)\n" "$(basename "$link")" "$(basename "$real")"

    for pat in g005 g006; do
        # Use -L so it follows any symlinks inside the tree
        mapfile -t gdirs < <(find -L "$link" -type d -iname "$pat" 2>/dev/null | sort)

        if (( ${#gdirs[@]} == 0 )); then
            printf "          no %s anywhere inside\n" "$pat"
            continue
        fi

        found_any=true
        printf "          found %s/ (%d location(s))\n" "$pat" "${#gdirs[@]}"

        for gdir in "${gdirs[@]}"; do
            report_gdir "$link" "$pat" "$gdir"
        done
    done

    if [[ $found_any == false ]]; then
        echo "   (no g005 or g006 found in this subtree)"
    fi
    echo "────────────────────────────────────────"
}

#######################################
# Print the summary counters collected during the scan.
# Globals: total_top, nonempty_g005, nonempty_g006 (read)
#######################################
print_stats() {
    echo
    echo "═══════════════════════════════════════════════"
    echo "               S T A T S"
    echo "═══════════════════════════════════════════════"
    echo "Top-level subdirs processed        : $total_top"
    echo "Non-empty g005 instances           : $nonempty_g005"
    echo "Non-empty g006 instances           : $nonempty_g006"
    echo "═══════════════════════════════════════════════"
    echo "Done."
}

#######################################
# Entry point: scan every top-level subdirectory of TOP_DIR once.
# Globals:   EXCLUDE_SUBDIRS (read); total_top, nonempty_g005, nonempty_g006,
#            seen_realpath (reset, then updated)
# Arguments: $1 - directory to scan (optional, defaults to ".")
# Returns:   0 on success, 1 if $1 is not a directory
#######################################
main() {
    local top_dir=${1:-.}
    local excl link real
    local -a name_tests=() find_args=() candidates=()

    if [[ ! -d "$top_dir" ]]; then
        echo "Error: '$top_dir' is not a directory" >&2
        return 1
    fi

    top_dir=$(realpath "$top_dir" 2>/dev/null || echo "$top_dir")

    echo "Scanning: $top_dir"
    echo "Handling symlinks (e.g. 20260225 -> SO_ALL_LZ_...) and real folders"
    echo "Deduplicating so each physical folder is checked only once"
    echo

    # Assemble the find expression:
    #   [ ( -name A -o -name B ... ) -prune -o ] ( -type d -o -type l ) -print
    # The type test has to sit on the -print side of the -o. If it is placed
    # in front of the prune group it only restricts what gets pruned, and
    # every other entry (regular files included) falls through to -print.
    find_args=(-L "$top_dir" -mindepth 1 -maxdepth 1)
    if (( ${#EXCLUDE_SUBDIRS[@]} > 0 )); then
        for excl in "${EXCLUDE_SUBDIRS[@]}"; do
            name_tests+=(-o -name "$excl")
        done
        # The :1 slice drops the leading -o.
        find_args+=( \( "${name_tests[@]:1}" \) -prune -o )
    fi
    # Under -L a symlink to a directory already tests as -type d; -type l then
    # matches only dangling links, which are still listed so they appear in
    # the report.
    find_args+=( \( -type d -o -type l \) -print )

    echo "Top-level subdirs..."
    mapfile -t candidates < <(find "${find_args[@]}" 2>/dev/null | sort)

    total_top=0
    nonempty_g005=0
    nonempty_g006=0
    seen_realpath=()

    # Guarded because expanding an empty array under `set -u` is an error in
    # bash older than 4.4.
    if (( ${#candidates[@]} > 0 )); then
        for link in "${candidates[@]}"; do
            # Resolve symlinks so that a date link and its target map to the
            # same key. (realpath -s would leave symlinks unresolved and
            # defeat the deduplication below.)
            real=$(realpath "$link" 2>/dev/null || echo "$link")

            # Skip if this physical folder has already been processed.
            if [[ ${seen_realpath[$real]+isset} ]]; then
                continue
            fi
            seen_realpath["$real"]=1

            (( total_top++ ))
            scan_subdir "$link" "$real"
        done
    fi

    print_stats
    return 0
}

main "$@"
