#!/usr/bin/env python3
"""
Dump selected repository files into a single text file, honoring .gitignore,
sensible default ignore patterns, and extracting **only text-readable files**
(no binary blobs).

Features:
- Large default ignore list (build artifacts, caches, lockfiles, etc.).
- Accepts extra kwargs (safe to pass unknown keyword arguments).
- Extensions accept "*" to mean "all".
- Excludes can be provided as a space-separated list (CLI) or any of: string with
  commas/whitespace, list/tuple; glob patterns supported (fnmatch).
- Binary detection: only text-like files are included (BOM/UTF-8 checks + heuristics).
  Override with `--include-binary` if ever needed.

Examples:
    # Everything (respecting gitignore + defaults), write to data.txt
    python script.py .

    # Only py,md; space-separated excludes; auto output "data.py"
    python script.py . -e py,md -x migrations tests docs

    # All extensions ("*"), but still allow manual exclude patterns
    python script.py . -e "*" -x "node_modules dist build"

    # With keyword args from another Python module
    from script import structure_directory_content
    structure_directory_content(
        input_dir=".",
        extensions="py,go",
        exclude=["migrations", "node_modules"],
        output_file="repo_dump.txt",
    )
"""
import argparse
import codecs
import fnmatch
import os
import sys
from typing import Iterable, List, Optional, Sequence, Tuple, Union

import pathspec


def _default_garbage_patterns() -> List[str]:
    """
    A broad default set of ignore patterns using gitwildmatch semantics.
    These complement (not replace) patterns found in .gitignore.
    """
    return [
        # VCS
        "**/.git/**",
        "**/.hg/**",
        "**/.svn/**",

        # OS cruft
        "**/.DS_Store",
        "**/Thumbs.db",

        # Node / JS
        "**/node_modules/**",
        "**/package-lock.json",
        "**/yarn.lock",
        "**/pnpm-lock.yaml",
        "**/bun.lockb",
        "**/.yarn/**",
        "**/.pnpm-store/**",
        "**/.parcel-cache/**",
        "**/.next/**",
        "**/.nuxt/**",
        "**/jspm_packages/**",
        "**/bower_components/**",

        # Python
        "**/__pycache__/**",
        "**/*.pyc",
        "**/*.pyo",
        "**/.pytest_cache/**",
        "**/.mypy_cache/**",
        "**/.ruff_cache/**",
        "**/.tox/**",
        "**/.coverage*",
        "**/coverage.xml",
        "**/.cache/**",
        "**/*.egg-info/**",
        "**/.venv/**",
        "uv.lock",
        "**/venv/**",
        "**/env/**",
        "**/.ipynb_checkpoints/**",
        "poetry.lock",
        "Pipfile.lock",

        # Go
        "go.work.sum",
        "go.sum",
        "*_minimock.go",

        # Java / Kotlin / Gradle
        "**/.gradle/**",
        "**/build/**",
        "**/target/**",
        "**/out/**",

        # General IDE / project files
        "**/.idea/**",
        "**/.vscode/**",
        ".next/",
        ".idea/",
        ".gradle/",

        # Web frameworks / uploads
        "static/uploads/*",

        # Terraform / Serverless
        "**/.terraform/**",
        "**/terraform.tfstate*",
        "**/.serverless/**",

        # Compiled / binaries / artifacts
        "**/*.class",
        "**/*.o",
        "**/*.a",
        "**/*.so",
        "**/*.dll",
        "**/*.exe",
        "**/*.dylib",
        "**/*.dSYM/**",
        "**/*.log",

        # Misc build outputs
        "**/dist/**",
        "**/.angular/**",
        "**/.svelte-kit/**",

        # Migrations (from original)
        "migrations/*.py",
        "**/migrations/*.py",

        # Locks from original (explicit)
        "package-lock.json",
    ]


def read_gitignore(input_dir: str, extra_patterns: Optional[Sequence[str]] = None) -> pathspec.PathSpec:
    """
    Build a PathSpec from .gitignore (if present) + default garbage patterns.

    Args:
        input_dir: Directory expected to contain a ``.gitignore``.
        extra_patterns: Optional additional gitwildmatch patterns to merge in.

    Returns:
        A compiled ``pathspec.PathSpec`` using "gitwildmatch" semantics.
    """
    default_patterns = _default_garbage_patterns()
    if extra_patterns:
        default_patterns.extend(extra_patterns)

    # Deduplicate while preserving order
    seen = set()
    merged_defaults = []
    for p in default_patterns:
        if p not in seen:
            seen.add(p)
            merged_defaults.append(p)

    gitignore_path = os.path.join(input_dir, ".gitignore")
    if os.path.isfile(gitignore_path):
        with open(gitignore_path, "r", encoding="utf-8", errors="ignore") as gitignore_file:
            patterns = merged_defaults + list(gitignore_file)
        return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
    else:
        return pathspec.PathSpec.from_lines("gitwildmatch", merged_defaults)


def _parse_extensions(extensions: Optional[Union[str, Iterable[str]]]) -> Optional[List[str]]:
    """
    Normalize extensions.

    Returns:
        None -> accept all
        ["py", "md", ...] -> accept files ending in these extensions (case-insensitive)
    """
    if extensions is None:
        return None

    # If user passed list/tuple/set
    if isinstance(extensions, (list, tuple, set)):
        items = list(extensions)
    else:
        # Accept both comma- and whitespace-separated
        raw = extensions.strip()
        if raw == "" or raw == "*":
            return None
        # split on commas or whitespace
        parts = [p for chunk in raw.split(",") for p in chunk.split()]
        items = [p for p in parts if p]

    # If any item is "*" -> all
    if any(it == "*" for it in items):
        return None

    # Normalize: strip leading dots and lower
    norm = [it.lower().lstrip(".") for it in items if it.strip() != ""]
    return norm or None


def _parse_exclude(exclude: Optional[Union[str, Iterable[str]]]) -> List[str]:
    """
    Normalize exclude patterns. Supports:
      - string with commas and/or whitespace separators
      - list/tuple/set of strings
    Patterns can be substrings or globs (fnmatch).
    """
    if exclude is None:
        return []
    if isinstance(exclude, (list, tuple, set)):
        items = [str(x) for x in exclude]
    else:
        raw = exclude.strip()
        if not raw:
            return []
        # split on commas first, then on whitespace inside each token
        items = [p for chunk in raw.split(",") for p in chunk.split()]
    # remove empties, dedupe preserve order
    seen = set()
    out: List[str] = []
    for it in items:
        if it and it not in seen:
            seen.add(it)
            out.append(it)
    return out


def _matches_extension(filename: str, exts: Optional[Sequence[str]]) -> bool:
    """True if filename ends with one of exts (case-insensitive); exts=None accepts all."""
    if exts is None:
        return True
    lname = filename.lower()
    for ext in exts:
        if lname.endswith(f".{ext}"):
            return True
    return False


def _nontext_ratio(chunk: bytes) -> float:
    """
    Ratio of control characters (excluding common whitespace).
    Treats bytes 0x20..0xFF as text-like to avoid penalizing non-ASCII.
    """
    if not chunk:
        return 0.0
    allowed = {7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100))
    nontext = sum(1 for b in chunk if b not in allowed)
    return nontext / len(chunk)


def _probe_text_encoding(path: str, blocksize: int = 8192, default: str = "utf-8") -> Tuple[bool, Optional[str]]:
    """
    Heuristically determine if file is text-like and pick a reasonable encoding.

    Strategy:
      - Empty files -> text.
      - NUL byte -> binary.
      - BOMs for UTF-8/16/32 -> text with that encoding.
      - Try UTF-8 decode -> if ok, text (utf-8).
      - Fallback heuristic: control-char ratio < 0.30 -> text (encoding unknown).
    """
    try:
        with open(path, "rb") as f:
            chunk = f.read(blocksize)
    except (PermissionError, IsADirectoryError, FileNotFoundError, OSError):
        return False, None

    if not chunk:
        return True, default

    # Quick binary checks
    if b"\x00" in chunk:
        return False, None

    # BOMs
    if chunk.startswith(codecs.BOM_UTF8):
        return True, "utf-8-sig"
    if chunk.startswith(codecs.BOM_UTF16_LE) or chunk.startswith(codecs.BOM_UTF16_BE):
        return True, "utf-16"
    if chunk.startswith(codecs.BOM_UTF32_LE) or chunk.startswith(codecs.BOM_UTF32_BE):
        return True, "utf-32"

    # Try UTF-8
    try:
        chunk.decode("utf-8")
        return True, "utf-8"
    except UnicodeDecodeError:
        pass

    # Heuristic on control chars
    if _nontext_ratio(chunk) < 0.30:
        return True, None

    return False, None


def _is_excluded(rel_path: str, patterns: Sequence[str]) -> bool:
    """
    Returns True if rel_path should be excluded based on patterns.
    Supports glob-style patterns via fnmatch as well as simple substring checks.
    """
    if not patterns:
        return False
    for pat in patterns:
        # Glob match
        if fnmatch.fnmatch(rel_path, pat):
            return True
        # Substring match
        if pat in rel_path:
            return True
    return False


def structure_directory_content(
    input_dir: str,
    output_file: Optional[str] = None,
    extensions: Optional[Union[str, Iterable[str]]] = None,
    exclude: Optional[Union[str, Iterable[str]]] = None,
    *,
    encoding: str = "utf-8",
    include_binary: bool = False,
    **kwargs,
) -> None:
    """
    Walk input_dir, concatenate **text** file contents into output_file.

    Args:
        input_dir: Directory to scan.
        output_file: Path to write collected contents. Defaults to "data.txt"
            or "data.<ext>" if a single explicit extension is provided.
        extensions: Comma/space-separated string, iterable, or "*" for all.
        exclude: Comma/space-separated string or iterable of patterns (substr or glob).
        encoding: Default encoding used to read files when detection is inconclusive.
        include_binary: If True, include files even when they look binary (NOT recommended).
        **kwargs: Accepted for forward-compatibility; ignored if unknown.
    """
    gitignore_spec = read_gitignore(input_dir)

    exts = _parse_extensions(extensions)
    excl = _parse_exclude(exclude)

    # Decide output filename
    if not output_file:
        if exts and len(exts) == 1:
            output_file = f"data.{exts[0]}"
        else:
            output_file = "data.txt"

    # Never ingest the dump file itself (it may live inside input_dir and
    # would otherwise be re-read on subsequent runs).
    output_abs = os.path.abspath(output_file)

    with open(output_file, "w", encoding=encoding, errors="ignore") as outfile:
        for root, dirs, files in os.walk(input_dir):
            # Match against paths *relative to input_dir* so that root-anchored
            # gitignore patterns (e.g. "migrations/*.py", "uv.lock") behave as
            # git does, regardless of where input_dir is.
            def _rel(name: str) -> str:
                return os.path.relpath(os.path.join(root, name), input_dir)

            # Prune ignored directories. The trailing "/" marks the path as a
            # directory so dir-only patterns (".idea/", "**/.git/**") prune it.
            dirs[:] = [
                d for d in sorted(dirs)
                if not gitignore_spec.match_file(_rel(str(d)) + "/")
            ]
            # Filter files via .gitignore + defaults
            files = [
                f for f in sorted(files)
                if not gitignore_spec.match_file(_rel(str(f)))
            ]

            for file in files:
                if not _matches_extension(file, exts):
                    continue

                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == output_abs:
                    continue
                relative_path = os.path.relpath(file_path, input_dir)

                if _is_excluded(relative_path, excl):
                    continue

                is_text, detected_enc = _probe_text_encoding(file_path)
                if not is_text and not include_binary:
                    continue

                enc_to_use = detected_enc or encoding
                try:
                    with open(file_path, "r", encoding=enc_to_use, errors="ignore") as infile:
                        data = infile.read()
                except (UnicodeDecodeError, PermissionError, IsADirectoryError, FileNotFoundError, OSError):
                    continue

                outfile.write(f"# {relative_path}\n")
                outfile.write(data)
                outfile.write("\n\n")


def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser."""
    p = argparse.ArgumentParser(
        description="Concatenate repository TEXT files into a single text file (respects .gitignore and sensible defaults)."
    )
    p.add_argument(
        "input_dir",
        nargs="?",
        default=".",
        help="Directory to scan (default: current directory).",
    )
    p.add_argument(
        "-o", "--output-file",
        dest="output_file",
        default=None,
        help="Output file path (default: data.txt or data.<ext> if single extension given).",
    )
    p.add_argument(
        "-e", "--extensions",
        dest="extensions",
        default=None,
        help='Comma/space-separated list of extensions (e.g., "py,md"). Use "*" for all.',
    )
    p.add_argument(
        "-x", "--exclude",
        dest="exclude",
        nargs="*",
        default=None,
        help="Space-separated exclude patterns (substr or glob). Commas within args are also supported.",
    )
    p.add_argument(
        "--encoding",
        dest="encoding",
        default="utf-8",
        help="Default encoding used to read files (fallback when detection is inconclusive).",
    )
    p.add_argument(
        "--include-binary",
        dest="include_binary",
        action="store_true",
        help="Include files even if they appear binary (NOT recommended).",
    )
    return p


def _from_cli(argv: Sequence[str]) -> None:
    """CLI entry point; falls back to interactive prompts when no args given."""
    # Backwards-compatible interactive mode if no CLI args
    if len(argv) == 1:
        input_directory = input("directory path: ").strip() or "."
        output_filename = input("output file name (optional): ").strip()
        file_extensions = input('file extensions separated by commas or spaces (optional, "*" for all): ').strip()
        exclude_patterns_input = input("exclude patterns separated by spaces or commas (optional): ").strip()

        structure_directory_content(
            input_dir=input_directory,
            output_file=output_filename if output_filename else None,
            extensions=file_extensions if file_extensions else None,
            exclude=exclude_patterns_input if exclude_patterns_input else None,
        )
        return

    parser = _build_arg_parser()
    args = parser.parse_args(argv[1:])

    # If exclude was provided as multiple space-separated args, join back into one string
    # so it can also accept commas inside tokens transparently.
    exclude_arg: Optional[Union[str, Iterable[str]]]
    if args.exclude is None:
        exclude_arg = None
    elif len(args.exclude) == 1:
        exclude_arg = args.exclude[0]
    else:
        exclude_arg = args.exclude  # already a list; parser will handle both

    structure_directory_content(
        input_dir=args.input_dir,
        output_file=args.output_file,
        extensions=args.extensions,
        exclude=exclude_arg,
        encoding=args.encoding,
        include_binary=args.include_binary,
    )


if __name__ == "__main__":
    _from_cli(sys.argv)