updated extract_folder script

2025-12-02 15:25:23 +03:00 · 2025-11-28 16:04:30 +03:00 · 2025-11-28 16:04:30 +03:00 · 61dd049637
commit 61dd049637
parent 06cd5b7bea
1 changed files with 443 additions and 68 deletions
--- a/extract_folder.py
+++ b/extract_folder.py
@ -1,98 +1,473 @@
+#!/usr/bin/env python3
+"""
+Dump selected repository files into a single text file, honoring .gitignore,
+sensible default ignore patterns, and extracting **only text-readable files**
+(no binary blobs).
+
+What’s new:
+- Much larger default ignore list (build artifacts, caches, lockfiles, etc.).
+- Accept kwargs (safe to pass extra keyword arguments).
+- Extensions accept "*" to mean "all".
+- Excludes can be provided as a space-separated list (CLI) or any of: string with
+  commas/whitespace, list/tuple; glob patterns supported (fnmatch).
+- **Binary detection**: only text-like files are included (BOM/UTF-8 checks + heuristics).
+  You can override with `--include-binary` if ever needed.
+
+Examples:
+  # Everything (respecting gitignore + defaults), write to data.txt
+  python script.py .
+
+  # Only py,md; space-separated excludes; output is "data.txt" (only a single extension, e.g. -e py, gives "data.py")
+  python script.py . -e py,md -x migrations tests docs
+
+  # All extensions ("*"), but still allow manual exclude patterns
+  python script.py . -e "*" -x "node_modules dist build"
+
+  # With keyword args from another Python module
+  from script import structure_directory_content
+  structure_directory_content(
+      input_dir=".",
+      extensions="py,go",
+      exclude=["migrations", "node_modules"],
+      output_file="repo_dump.txt",
+  )
+"""
+import argparse
+import codecs
+import fnmatch
 import os
 import sys
+from typing import Iterable, List, Optional, Sequence, Tuple, Union
+
 import pathspec

-def read_gitignore(input_dir):
+
+def _default_garbage_patterns() -> List[str]:
    """
-    Reads the .gitignore file in the input directory and returns a pathspec object.
+    Return the built-in ignore patterns, written in gitwildmatch (.gitignore) syntax.
+
+    These complement (not replace) patterns found in .gitignore. A fresh list
+    is built on every call, so the caller may extend the result in place.
    """
-    # Default patterns to always ignore
-    default_patterns = [
-        "migrations/*.py",  # Ignore Django migration files
-        "node_modules/",    # Ignore node_modules directory
+    return [
+        # VCS
+        "**/.git/**",
+        "**/.hg/**",
+        "**/.svn/**",
+
+        # OS cruft
+        "**/.DS_Store",
+        "**/Thumbs.db",
+
+        # Node / JS
+        "**/node_modules/**",
+        "**/package-lock.json",
+        "**/yarn.lock",
+        "**/pnpm-lock.yaml",
+        "**/bun.lockb",
+        "**/.yarn/**",
+        "**/.pnpm-store/**",
+        "**/.parcel-cache/**",
+        "**/.next/**",
+        "**/.nuxt/**",
+        "**/jspm_packages/**",
+        "**/bower_components/**",
+
+        # Python
+        "**/__pycache__/**",
+        "**/*.pyc",
+        "**/*.pyo",
+        "**/.pytest_cache/**",
+        "**/.mypy_cache/**",
+        "**/.ruff_cache/**",
+        "**/.tox/**",
+        "**/.coverage*",
+        "**/coverage.xml",
+        "**/.cache/**",
+        "**/*.egg-info/**",
+        "**/.venv/**",
+        "uv.lock",
+        "**/venv/**",
+        "**/env/**",
+        "**/.ipynb_checkpoints/**",
+        "poetry.lock",
+        "Pipfile.lock",
+
+        # Go
+        "go.work.sum",
+        "go.sum",
+        "*_minimock.go",
+
+        # Java / Kotlin / Gradle
+        "**/.gradle/**",
+        "**/build/**",
+        "**/target/**",
+        "**/out/**",
+
+        # General IDE / project files
+        "**/.idea/**",
+        "**/.vscode/**",
+        # NOTE(review): the three directory patterns below look redundant with the
+        # "**/.next/**", "**/.idea/**" and "**/.gradle/**" entries above - confirm and drop.
+        ".next/",
+        ".idea/",
+        ".gradle/",
+
+        # Web frameworks / uploads
+        "static/uploads/*",
+
+        # Terraform / Serverless
+        "**/.terraform/**",
+        "**/terraform.tfstate*",
+        "**/.serverless/**",
+
+        # Compiled / binaries / artifacts
+        "**/*.class",
+        "**/*.o",
+        "**/*.a",
+        "**/*.so",
+        "**/*.dll",
+        "**/*.exe",
+        "**/*.dylib",
+        "**/*.dSYM/**",
+        "**/*.log",
+
+        # Misc build outputs
+        "**/dist/**",
+        "**/.angular/**",
+        "**/.svelte-kit/**",
+
+        # Migrations (from original)
+        # The first form contains a slash, so it only applies at the top level;
+        # the "**/" form covers migrations directories at any depth.
+        "migrations/*.py",
+        "**/migrations/*.py",
+
+        # Locks from original (explicit)
+        # NOTE(review): presumably equivalent to "**/package-lock.json" in the Node section - confirm.
+        "package-lock.json",
    ]
-    
+
+
+def read_gitignore(input_dir: str, extra_patterns: Optional[Sequence[str]] = None) -> pathspec.PathSpec:
+    """
+    Compile the ignore rules for input_dir into a single PathSpec.
+
+    The rules are the built-in default patterns, then extra_patterns (if any),
+    with exact duplicates removed, followed by the lines of input_dir/.gitignore
+    when that file exists.
+    """
+    candidates = _default_garbage_patterns() + list(extra_patterns or ())
+
+    # dict.fromkeys drops repeated patterns while keeping first-seen order.
+    merged_defaults = list(dict.fromkeys(candidates))
+
    gitignore_path = os.path.join(input_dir, ".gitignore")
    if os.path.isfile(gitignore_path):
-        with open(gitignore_path, "r") as gitignore_file:
-            patterns = default_patterns + list(gitignore_file)
+        with open(gitignore_path, "r", encoding="utf-8", errors="ignore") as handle:
+            patterns = [*merged_defaults, *handle]
            return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
    else:
-        return pathspec.PathSpec.from_lines("gitwildmatch", default_patterns)
+        return pathspec.PathSpec.from_lines("gitwildmatch", merged_defaults)

-def should_ignore_path(path, gitignore_spec):
+
+def _parse_extensions(extensions: Optional[Union[str, Iterable[str]]]) -> Optional[List[str]]:
    """
-    Additional check for paths that should be ignored.
+    Normalize extensions.
+
+    Accepts a string (commas and/or whitespace separate the entries) or any
+    iterable of strings - lists, tuples, sets, frozensets, generators.
+
+    Returns:
+      None -> accept all (input was None, empty, or contained "*")
+      ["py", "md", ...] -> accept files ending in these extensions (case-insensitive)
    """
-    # Check if the path contains node_modules or migrations directory
-    if "node_modules" in path.split(os.sep) or \
-       ("migrations" in path.split(os.sep) and path.endswith(".py")):
+    if extensions is None:
+        return None
+
+    if isinstance(extensions, str):
+        # Commas and whitespace are interchangeable separators.
+        items = extensions.replace(",", " ").split()
+    else:
+        # Branch on "is it a str" rather than on list/tuple/set so that every
+        # other iterable is handled instead of failing on .strip().
+        items = [str(it).strip() for it in extensions]
+
+    # If any item is "*" -> all
+    if "*" in items:
+        return None
+
+    # Normalize: lower-case, strip leading dots, drop entries that end up empty.
+    norm = [it.lower().lstrip(".") for it in items]
+    norm = [it for it in norm if it]
+    return norm or None
+
+
+def _parse_exclude(exclude: Optional[Union[str, Iterable[str]]]) -> List[str]:
+    """
+    Normalize exclude patterns. Supports:
+    - string with commas and/or whitespace separators
+    - any iterable of strings (list, tuple, set, frozenset, generator, ...);
+      items are taken as-is and are not split further
+    Patterns can be substrings or globs (fnmatch).
+
+    Returns the non-empty patterns, de-duplicated, in first-seen order.
+    """
+    if exclude is None:
+        return []
+    if isinstance(exclude, str):
+        # Commas and whitespace are interchangeable separators.
+        items = exclude.replace(",", " ").split()
+    else:
+        # Branch on "is it a str" rather than on list/tuple/set so that every
+        # other iterable is handled instead of failing on .strip().
+        items = [str(x) for x in exclude]
+    # dict.fromkeys removes duplicates while preserving order.
+    return [it for it in dict.fromkeys(items) if it]
+
+
+def _matches_extension(filename: str, exts: Optional[Sequence[str]]) -> bool:
+    # exts=None accepts every file; otherwise the lower-cased filename must end
+    # with "." followed by one of exts (exts are compared exactly as given).
+    if exts is None:
        return True
-    return gitignore_spec.match_file(path)
+    suffixes = tuple(f".{ext}" for ext in exts)
+    return filename.lower().endswith(suffixes)

-def structure_directory_content(input_dir, output_file=None, extensions=None):
+
+def _nontext_ratio(chunk: bytes) -> float:
    """
-    This function goes through the input directory recursively and structures
-    all the file contents into one output file based on the given extensions.
-    :param input_dir: The input directory to search for files.
-    :param output_file: The output file where the content will be structured.
-                        If None, 'data.txt' or 'data.<extension>' will be used.
-    :param extensions: A list of file extensions to include. If None, all files are included.
+    Fraction of the bytes in chunk that are not text-like.
+    Text-like means byte values 7, 8, 9, 10, 12, 13, 27 and everything from
+    0x20 through 0xFF; an empty chunk scores 0.0.
+    """
+    total = len(chunk)
+    if total == 0:
+        return 0.0
+    textlike = bytes((7, 8, 9, 10, 12, 13, 27)) + bytes(range(0x20, 0x100))
+    # translate(None, delete) removes every text-like byte; what is left is non-text.
+    return len(chunk.translate(None, textlike)) / total
+
+
+def _probe_text_encoding(path: str, blocksize: int = 8192, default: str = "utf-8") -> Tuple[bool, Optional[str]]:
+    """
+    Heuristically determine if file is text-like and pick a reasonable encoding.
+
+    Only the first blocksize bytes of the file are inspected.
+
+    Strategy:
+      - Unreadable path -> (False, None).
+      - Empty files -> text (default encoding).
+      - BOMs for UTF-8/32/16 -> text with that encoding.
+      - NUL byte (and no BOM) -> binary.
+      - Try UTF-8 decode -> if ok, text (utf-8).
+      - Fallback heuristic: control-char ratio < 0.30 -> text (encoding unknown, None).
+
+    Returns:
+      (is_text, encoding); encoding is None when the file is binary or the
+      encoding could not be determined.
+    """
+    try:
+        with open(path, "rb") as f:
+            chunk = f.read(blocksize)
+    except OSError:
+        # Covers PermissionError, IsADirectoryError and FileNotFoundError too.
+        return False, None
+
+    if not chunk:
+        return True, default
+
+    # BOMs come first: UTF-16/32 text is full of NUL bytes, so running the NUL
+    # check earlier would misclassify every BOM-marked UTF-16/32 file as binary.
+    if chunk.startswith(codecs.BOM_UTF8):
+        return True, "utf-8-sig"
+    # UTF-32 before UTF-16: the UTF-32-LE BOM begins with the UTF-16-LE BOM.
+    if chunk.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
+        return True, "utf-32"
+    if chunk.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
+        return True, "utf-16"
+
+    # Quick binary check
+    if b"\x00" in chunk:
+        return False, None
+
+    # Try UTF-8. When the chunk was cut at blocksize it may end in the middle of
+    # a multibyte character, so only insist on a complete final character when
+    # the whole file was read.
+    try:
+        decoder = codecs.getincrementaldecoder("utf-8")()
+        decoder.decode(chunk, final=len(chunk) < blocksize)
+        return True, "utf-8"
+    except UnicodeDecodeError:
+        pass
+
+    # Heuristic on control chars
+    if _nontext_ratio(chunk) < 0.30:
+        return True, None
+
+    return False, None
+
+
+def _is_excluded(rel_path: str, patterns: Sequence[str]) -> bool:
+    """
+    Tell whether rel_path is hit by any of the exclude patterns.
+    A pattern hits when it matches the whole path as an fnmatch glob or when
+    it simply occurs somewhere inside the path as a substring.
+    """
+    if not patterns:
+        return False
+    return any(
+        fnmatch.fnmatch(rel_path, pat) or pat in rel_path
+        for pat in patterns
+    )
+
+
+def structure_directory_content(
+    input_dir: str,
+    output_file: Optional[str] = None,
+    extensions: Optional[Union[str, Iterable[str]]] = None,
+    exclude: Optional[Union[str, Iterable[str]]] = None,
+    *,
+    encoding: str = "utf-8",
+    include_binary: bool = False,
+    **kwargs,
+) -> None:
+    """
+    Walk input_dir, concatenate **text** file contents into output_file.
+
+    Ignore rules (.gitignore + defaults) are matched against paths relative to
+    input_dir, so the location or name of input_dir itself never causes files
+    to be ignored. The output file is never dumped into itself, even when it
+    lives inside input_dir. Each included file is written as a "# <relative
+    path>" header line, the file's text, and a blank separator.
+
+    Args:
+        input_dir: Directory to scan.
+        output_file: Path to write collected contents. Defaults to "data.txt"
+                     or "data.<ext>" if a single explicit extension is provided.
+        extensions: Comma/space-separated string, iterable, or "*" for all.
+        exclude: Comma/space-separated string or iterable of patterns (substr or glob).
+        encoding: Encoding of the output file, and the fallback used to read
+                  files when detection is inconclusive.
+        include_binary: If True, include files even when they look binary (NOT recommended).
+        **kwargs: Accepted for forward-compatibility; ignored if unknown.
    """
    gitignore_spec = read_gitignore(input_dir)

-    if extensions:
-        extensions = [ext.strip() for ext in extensions.split(",") if ext.strip() != ""]
-        if not output_file and len(extensions) == 1:
-            output_file = f"data.{extensions[0]}"
-    else:
-        extensions = None
+    exts = _parse_extensions(extensions)
+    excl = _parse_exclude(exclude)

+    # Decide output filename
    if not output_file:
-        output_file = "data.txt"
+        if exts and len(exts) == 1:
+            output_file = f"data.{exts[0]}"
+        else:
+            output_file = "data.txt"
+    # Remembered so the walk can skip the dump file when it sits inside input_dir.
+    output_abs = os.path.abspath(output_file)

-    with open(output_file, "w") as outfile:
+    with open(output_file, "w", encoding=encoding, errors="ignore") as outfile:
        for root, dirs, files in os.walk(input_dir):
-            # Filter files and directories using the enhanced ignore check
-            files = [
-                f for f in files
-                if not should_ignore_path(os.path.join(root, str(f)), gitignore_spec)
-            ]
-            
-            # Filter directories in-place
+            # Prune ignored directories. Rules are checked against the path
+            # relative to input_dir: with the raw walk path, scanning e.g.
+            # "build/" or "/home/me/env/proj" would match "**/build/**" or
+            # "**/env/**" for every single file.
            dirs[:] = [
-                d for d in dirs
-                if not should_ignore_path(os.path.join(root, str(d)), gitignore_spec)
+                d for d in sorted(dirs)
+                if not gitignore_spec.match_file(
+                    os.path.relpath(os.path.join(root, str(d)), input_dir)
+                )
+            ]
+            # Filter files via .gitignore + defaults
+            files = [
+                f for f in sorted(files)
+                if not gitignore_spec.match_file(
+                    os.path.relpath(os.path.join(root, str(f)), input_dir)
+                )
            ]

            for file in files:
-                if extensions is None or any(
-                    file.endswith(f".{ext}") for ext in extensions
-                ):
-                    file_path = os.path.join(root, file)
-                    try:
-                        with open(file_path, "r") as infile:
-                            data = infile.read()
-                            outfile.write(
-                                f"# {os.path.relpath(file_path, input_dir)}\n"
-                            )
-                            outfile.write(data)
-                            outfile.write("\n\n")
-                    except UnicodeDecodeError:
-                        continue
+                if not _matches_extension(file, exts):
+                    continue
+
+                file_path = os.path.join(root, file)
+                # The dump file is open for writing right now; never read it back in.
+                if os.path.abspath(file_path) == output_abs:
+                    continue
+
+                relative_path = os.path.relpath(file_path, input_dir)
+
+                if _is_excluded(relative_path, excl):
+                    continue
+
+                is_text, detected_enc = _probe_text_encoding(file_path)
+                if not is_text and not include_binary:
+                    continue
+
+                enc_to_use = detected_enc or encoding
+                try:
+                    with open(file_path, "r", encoding=enc_to_use, errors="ignore") as infile:
+                        data = infile.read()
+                except (UnicodeDecodeError, OSError):
+                    # Best effort: unreadable files are skipped, not fatal.
+                    continue
+
+                outfile.write(f"# {relative_path}\n")
+                outfile.write(data)
+                outfile.write("\n\n")
+
+
+def _build_arg_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser: one optional positional directory plus the -o/-e/-x/--encoding/--include-binary options."""
+    parser = argparse.ArgumentParser(
+        description="Concatenate repository TEXT files into a single text file (respects .gitignore and sensible defaults)."
+    )
+
+    # Positional: where to scan.
+    parser.add_argument(
+        "input_dir",
+        default=".",
+        nargs="?",
+        help="Directory to scan (default: current directory).",
+    )
+
+    # Options that take values; argparse derives each destination from the long flag name.
+    parser.add_argument(
+        "-o",
+        "--output-file",
+        help="Output file path (default: data.txt or data.<ext> if single extension given).",
+    )
+    parser.add_argument(
+        "-e",
+        "--extensions",
+        help='Comma/space-separated list of extensions (e.g., "py,md"). Use "*" for all.',
+    )
+    parser.add_argument(
+        "-x",
+        "--exclude",
+        nargs="*",
+        help="Space-separated exclude patterns (substr or glob). Commas within args are also supported.",
+    )
+    parser.add_argument(
+        "--encoding",
+        default="utf-8",
+        help="Default encoding used to read files (fallback when detection is inconclusive).",
+    )
+
+    # Boolean switch.
+    parser.add_argument(
+        "--include-binary",
+        action="store_true",
+        help="Include files even if they appear binary (NOT recommended).",
+    )
+    return parser
+
+
+def _from_cli(argv: Sequence[str]) -> None:
+    """
+    Run the tool from a full argv list (program name first).
+
+    With no arguments besides the program name, the four settings are asked for
+    interactively; otherwise argv[1:] is parsed with the argparse parser.
+    """
+    # Backwards-compatible interactive mode if no CLI args
+    if len(argv) == 1:
+        input_directory = input("directory path: ").strip() or "."
+        output_filename = input("output file name (optional): ").strip()
+        file_extensions = input('file extensions separated by commas or spaces (optional, "*" for all): ').strip()
+        exclude_patterns_input = input("exclude patterns separated by spaces or commas (optional): ").strip()
+
+        structure_directory_content(
+            input_dir=input_directory,
+            output_file=output_filename if output_filename else None,
+            extensions=file_extensions if file_extensions else None,
+            exclude=exclude_patterns_input if exclude_patterns_input else None,
+        )
+        return
+
+    parser = _build_arg_parser()
+    args = parser.parse_args(argv[1:])
+
+    # -x collects a list of tokens. Join them back into one string so that
+    # commas inside any token ("-x a,b c") are split exactly like spaces;
+    # passing the list through would keep "a,b" as a single pattern.
+    exclude_arg: Optional[str] = " ".join(args.exclude) if args.exclude else None
+
+    structure_directory_content(
+        input_dir=args.input_dir,
+        output_file=args.output_file,
+        extensions=args.extensions,
+        exclude=exclude_arg,
+        encoding=args.encoding,
+        include_binary=args.include_binary,
+    )
+

 if __name__ == "__main__":
-    if len(sys.argv) == 1:
-        input_directory = input("directory path: ")
-        output_filename = input("output file name (optional): ")
-        file_extensions = input("file extensions separated by commas (optional): ")
-        structure_directory_content(
-            input_directory,
-            output_filename if output_filename else None,
-            file_extensions if file_extensions else None,
-        )
-    else:
-        input_directory = sys.argv[1] if len(sys.argv) > 1 else "."
-        output_filename = sys.argv[2] if len(sys.argv) > 2 else None
-        file_extensions = sys.argv[3] if len(sys.argv) > 3 else None
-        structure_directory_content(input_directory, output_filename, file_extensions)
+    # Pass the complete argv (program name included): _from_cli prompts
+    # interactively when it holds nothing but the program name.
+    _from_cli(sys.argv)
+