scripts/extract_folder.py

474 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Dump selected repository files into a single text file, honoring .gitignore,
sensible default ignore patterns, and extracting **only text-readable files**
(no binary blobs).
What's new:
- Much larger default ignore list (build artifacts, caches, lockfiles, etc.).
- Accept kwargs (safe to pass extra keyword arguments).
- Extensions accept "*" to mean "all".
- Excludes can be provided as a space-separated list (CLI) or any of: string with
commas/whitespace, list/tuple; glob patterns supported (fnmatch).
- **Binary detection**: only text-like files are included (BOM/UTF-8 checks + heuristics).
You can override with `--include-binary` if ever needed.
Examples:
# Everything (respecting gitignore + defaults), write to data.txt
python script.py .
# Only py,md; space-separated excludes; auto output "data.py"
python script.py . -e py,md -x migrations tests docs
# All extensions ("*"), but still allow manual exclude patterns
python script.py . -e "*" -x "node_modules dist build"
# With keyword args from another Python module
from script import structure_directory_content
structure_directory_content(
input_dir=".",
extensions="py,go",
exclude=["migrations", "node_modules"],
output_file="repo_dump.txt",
)
"""
import argparse
import codecs
import fnmatch
import os
import sys
from typing import Iterable, List, Optional, Sequence, Tuple, Union
import pathspec
def _default_garbage_patterns() -> List[str]:
"""
A broad default set of ignore patterns using gitwildmatch semantics.
These complement (not replace) patterns found in .gitignore.
"""
return [
# VCS
"**/.git/**",
"**/.hg/**",
"**/.svn/**",
# OS cruft
"**/.DS_Store",
"**/Thumbs.db",
# Node / JS
"**/node_modules/**",
"**/package-lock.json",
"**/yarn.lock",
"**/pnpm-lock.yaml",
"**/bun.lockb",
"**/.yarn/**",
"**/.pnpm-store/**",
"**/.parcel-cache/**",
"**/.next/**",
"**/.nuxt/**",
"**/jspm_packages/**",
"**/bower_components/**",
# Python
"**/__pycache__/**",
"**/*.pyc",
"**/*.pyo",
"**/.pytest_cache/**",
"**/.mypy_cache/**",
"**/.ruff_cache/**",
"**/.tox/**",
"**/.coverage*",
"**/coverage.xml",
"**/.cache/**",
"**/*.egg-info/**",
"**/.venv/**",
"uv.lock",
"**/venv/**",
"**/env/**",
"**/.ipynb_checkpoints/**",
"poetry.lock",
"Pipfile.lock",
# Go
"go.work.sum",
"go.sum",
"*_minimock.go",
# Java / Kotlin / Gradle
"**/.gradle/**",
"**/build/**",
"**/target/**",
"**/out/**",
# General IDE / project files
"**/.idea/**",
"**/.vscode/**",
".next/",
".idea/",
".gradle/",
# Web frameworks / uploads
"static/uploads/*",
# Terraform / Serverless
"**/.terraform/**",
"**/terraform.tfstate*",
"**/.serverless/**",
# Compiled / binaries / artifacts
"**/*.class",
"**/*.o",
"**/*.a",
"**/*.so",
"**/*.dll",
"**/*.exe",
"**/*.dylib",
"**/*.dSYM/**",
"**/*.log",
# Misc build outputs
"**/dist/**",
"**/.angular/**",
"**/.svelte-kit/**",
# Migrations (from original)
"migrations/*.py",
"**/migrations/*.py",
# Locks from original (explicit)
"package-lock.json",
]
def read_gitignore(input_dir: str, extra_patterns: Optional[Sequence[str]] = None) -> pathspec.PathSpec:
    """
    Build a PathSpec from default garbage patterns, optional extra patterns,
    and the directory's .gitignore (when one exists).
    """
    # Default + extra patterns, de-duplicated while preserving order.
    patterns: List[str] = []
    seen = set()
    for pat in _default_garbage_patterns() + list(extra_patterns or []):
        if pat in seen:
            continue
        seen.add(pat)
        patterns.append(pat)
    # Raw .gitignore lines (if any) are appended as-is; pathspec handles
    # comments, blanks, and trailing newlines itself.
    gitignore_path = os.path.join(input_dir, ".gitignore")
    if os.path.isfile(gitignore_path):
        with open(gitignore_path, "r", encoding="utf-8", errors="ignore") as fh:
            patterns.extend(fh.readlines())
    return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
def _parse_extensions(extensions: Optional[Union[str, Iterable[str]]]) -> Optional[List[str]]:
"""
Normalize extensions.
Returns:
None -> accept all
["py", "md", ...] -> accept files ending in these extensions (case-insensitive)
"""
if extensions is None:
return None
# If user passed list/tuple/set
if isinstance(extensions, (list, tuple, set)):
items = list(extensions)
else:
# Accept both comma- and whitespace-separated
raw = extensions.strip()
if raw == "" or raw == "*":
return None
# split on commas or whitespace
parts = [p for chunk in raw.split(",") for p in chunk.split()]
items = [p for p in parts if p]
# If any item is "*" -> all
if any(it == "*" for it in items):
return None
# Normalize: strip leading dots and lower
norm = [it.lower().lstrip(".") for it in items if it.strip() != ""]
return norm or None
def _parse_exclude(exclude: Optional[Union[str, Iterable[str]]]) -> List[str]:
"""
Normalize exclude patterns. Supports:
- string with commas and/or whitespace separators
- list/tuple/set of strings
Patterns can be substrings or globs (fnmatch).
"""
if exclude is None:
return []
if isinstance(exclude, (list, tuple, set)):
items = [str(x) for x in exclude]
else:
raw = exclude.strip()
if not raw:
return []
# split on commas first, then on whitespace inside each token
items = [p for chunk in raw.split(",") for p in chunk.split()]
# remove empties, dedupe preserve order
seen = set()
out: List[str] = []
for it in items:
if it and it not in seen:
seen.add(it)
out.append(it)
return out
def _matches_extension(filename: str, exts: Optional[Sequence[str]]) -> bool:
if exts is None:
return True
lname = filename.lower()
for ext in exts:
if lname.endswith(f".{ext}"):
return True
return False
def _nontext_ratio(chunk: bytes) -> float:
"""
Ratio of control characters (excluding common whitespace).
Treats bytes 0x20..0xFF as text-like to avoid penalizing non-ASCII.
"""
if not chunk:
return 0.0
allowed = {7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100))
nontext = sum(1 for b in chunk if b not in allowed)
return nontext / len(chunk)
def _probe_text_encoding(path: str, blocksize: int = 8192, default: str = "utf-8") -> Tuple[bool, Optional[str]]:
"""
Heuristically determine if file is text-like and pick a reasonable encoding.
Strategy:
- Empty files -> text.
- NUL byte -> binary.
- BOMs for UTF-8/16/32 -> text with that encoding.
- Try UTF-8 decode -> if ok, text (utf-8).
- Fallback heuristic: control-char ratio < 0.30 -> text (encoding unknown).
"""
try:
with open(path, "rb") as f:
chunk = f.read(blocksize)
except (PermissionError, IsADirectoryError, FileNotFoundError, OSError):
return False, None
if not chunk:
return True, default
# Quick binary checks
if b"\x00" in chunk:
return False, None
# BOMs
if chunk.startswith(codecs.BOM_UTF8):
return True, "utf-8-sig"
if chunk.startswith(codecs.BOM_UTF16_LE) or chunk.startswith(codecs.BOM_UTF16_BE):
return True, "utf-16"
if chunk.startswith(codecs.BOM_UTF32_LE) or chunk.startswith(codecs.BOM_UTF32_BE):
return True, "utf-32"
# Try UTF-8
try:
chunk.decode("utf-8")
return True, "utf-8"
except UnicodeDecodeError:
pass
# Heuristic on control chars
if _nontext_ratio(chunk) < 0.30:
return True, None
return False, None
def _is_excluded(rel_path: str, patterns: Sequence[str]) -> bool:
"""
Returns True if rel_path should be excluded based on patterns.
Supports glob-style patterns via fnmatch as well as simple substring checks.
"""
if not patterns:
return False
for pat in patterns:
# Glob match
if fnmatch.fnmatch(rel_path, pat):
return True
# Substring match
if pat in rel_path:
return True
return False
def structure_directory_content(
    input_dir: str,
    output_file: Optional[str] = None,
    extensions: Optional[Union[str, Iterable[str]]] = None,
    exclude: Optional[Union[str, Iterable[str]]] = None,
    *,
    encoding: str = "utf-8",
    include_binary: bool = False,
    **kwargs,
) -> None:
    """
    Walk input_dir, concatenate **text** file contents into output_file.

    Args:
        input_dir: Directory to scan.
        output_file: Path to write collected contents. Defaults to "data.txt"
            or "data.<ext>" if a single explicit extension is provided.
        extensions: Comma/space-separated string, iterable, or "*" for all.
        exclude: Comma/space-separated string or iterable of patterns
            (substring or glob).
        encoding: Default encoding used to read files when detection is
            inconclusive.
        include_binary: If True, include files even when they look binary
            (NOT recommended).
        **kwargs: Accepted for forward-compatibility; ignored if unknown.
    """
    gitignore_spec = read_gitignore(input_dir)
    exts = _parse_extensions(extensions)
    excl = _parse_exclude(exclude)
    # Decide output filename
    if not output_file:
        output_file = f"data.{exts[0]}" if exts and len(exts) == 1 else "data.txt"
    # Remember the output's absolute path so we never dump it into itself
    # when it happens to live inside input_dir.
    output_abs = os.path.abspath(output_file)
    with open(output_file, "w", encoding=encoding, errors="ignore") as outfile:
        for root, dirs, files in os.walk(input_dir):
            # .gitignore patterns are anchored at the repo root, so match
            # paths *relative to input_dir* (with forward slashes), not paths
            # joined from the current working directory.
            def _rel(name: str) -> str:
                return os.path.relpath(os.path.join(root, name), input_dir).replace(os.sep, "/")
            # Prune ignored directories; the trailing "/" lets directory-only
            # patterns such as "build/" match.
            dirs[:] = [
                d for d in sorted(dirs)
                if not gitignore_spec.match_file(_rel(d) + "/")
            ]
            for file in sorted(files):
                # Filter via .gitignore + defaults
                if gitignore_spec.match_file(_rel(file)):
                    continue
                if not _matches_extension(file, exts):
                    continue
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == output_abs:
                    continue
                relative_path = os.path.relpath(file_path, input_dir)
                if _is_excluded(relative_path, excl):
                    continue
                is_text, detected_enc = _probe_text_encoding(file_path)
                if not is_text and not include_binary:
                    continue
                enc_to_use = detected_enc or encoding
                try:
                    with open(file_path, "r", encoding=enc_to_use, errors="ignore") as infile:
                        data = infile.read()
                except (UnicodeDecodeError, PermissionError, IsADirectoryError, FileNotFoundError, OSError):
                    continue
                outfile.write(f"# {relative_path}\n")
                outfile.write(data)
                outfile.write("\n\n")
def _build_arg_parser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser(
description="Concatenate repository TEXT files into a single text file (respects .gitignore and sensible defaults)."
)
p.add_argument(
"input_dir",
nargs="?",
default=".",
help="Directory to scan (default: current directory).",
)
p.add_argument(
"-o", "--output-file",
dest="output_file",
default=None,
help="Output file path (default: data.txt or data.<ext> if single extension given).",
)
p.add_argument(
"-e", "--extensions",
dest="extensions",
default=None,
help='Comma/space-separated list of extensions (e.g., "py,md"). Use "*" for all.',
)
p.add_argument(
"-x", "--exclude",
dest="exclude",
nargs="*",
default=None,
help="Space-separated exclude patterns (substr or glob). Commas within args are also supported.",
)
p.add_argument(
"--encoding",
dest="encoding",
default="utf-8",
help="Default encoding used to read files (fallback when detection is inconclusive).",
)
p.add_argument(
"--include-binary",
dest="include_binary",
action="store_true",
help="Include files even if they appear binary (NOT recommended).",
)
return p
def _from_cli(argv: Sequence[str]) -> None:
    """
    CLI entry point.

    With no command-line arguments, fall back to the legacy interactive
    prompts; otherwise parse *argv* with argparse and dispatch to
    structure_directory_content.
    """
    if len(argv) == 1:
        # Backwards-compatible interactive mode
        directory = input("directory path: ").strip() or "."
        out_name = input("output file name (optional): ").strip()
        exts = input('file extensions separated by commas or spaces (optional, "*" for all): ').strip()
        excludes = input("exclude patterns separated by spaces or commas (optional): ").strip()
        structure_directory_content(
            input_dir=directory,
            output_file=out_name or None,
            extensions=exts or None,
            exclude=excludes or None,
        )
        return
    args = _build_arg_parser().parse_args(argv[1:])
    # A single -x token is passed through as a plain string so commas inside
    # it are still split by the parser; multiple tokens stay a list.
    exclude_arg: Optional[Union[str, Iterable[str]]]
    if args.exclude is None:
        exclude_arg = None
    elif len(args.exclude) == 1:
        exclude_arg = args.exclude[0]
    else:
        exclude_arg = args.exclude
    structure_directory_content(
        input_dir=args.input_dir,
        output_file=args.output_file,
        extensions=args.extensions,
        exclude=exclude_arg,
        encoding=args.encoding,
        include_binary=args.include_binary,
    )
if __name__ == "__main__":
    # Script entry point: dispatch to interactive or argparse-based CLI.
    _from_cli(sys.argv)