updated extract_folder script

This commit is contained in:
Alexander Karpov 2025-11-28 16:04:30 +03:00
parent 06cd5b7bea
commit 61dd049637

View File

@ -1,98 +1,473 @@
#!/usr/bin/env python3
"""
Dump selected repository files into a single text file, honoring .gitignore,
sensible default ignore patterns, and extracting **only text-readable files**
(no binary blobs).
What's new:
- Much larger default ignore list (build artifacts, caches, lockfiles, etc.).
- Accept kwargs (safe to pass extra keyword arguments).
- Extensions accept "*" to mean "all".
- Excludes can be provided as a space-separated list (CLI) or any of: string with
commas/whitespace, list/tuple; glob patterns supported (fnmatch).
- **Binary detection**: only text-like files are included (BOM/UTF-8 checks + heuristics).
You can override with `--include-binary` if ever needed.
Examples:
# Everything (respecting gitignore + defaults), write to data.txt
python script.py .
# Only py,md; space-separated excludes; output defaults to "data.txt" (auto "data.<ext>" applies only when a single extension is given)
python script.py . -e py,md -x migrations tests docs
# All extensions ("*"), but still allow manual exclude patterns
python script.py . -e "*" -x "node_modules dist build"
# With keyword args from another Python module
from script import structure_directory_content
structure_directory_content(
input_dir=".",
extensions="py,go",
exclude=["migrations", "node_modules"],
output_file="repo_dump.txt",
)
"""
import argparse
import codecs
import fnmatch
import os import os
import sys import sys
from typing import Iterable, List, Optional, Sequence, Tuple, Union
import pathspec import pathspec
def read_gitignore(input_dir):
def _default_garbage_patterns() -> List[str]:
    """
    Return the built-in ignore patterns (gitwildmatch syntax).

    The patterns are applied in addition to whatever .gitignore declares.
    A fresh list is built on every call, so callers may mutate the result.
    """
    grouped = (
        # VCS
        ("**/.git/**", "**/.hg/**", "**/.svn/**"),
        # OS cruft
        ("**/.DS_Store", "**/Thumbs.db"),
        # Node / JS
        (
            "**/node_modules/**",
            "**/package-lock.json",
            "**/yarn.lock",
            "**/pnpm-lock.yaml",
            "**/bun.lockb",
            "**/.yarn/**",
            "**/.pnpm-store/**",
            "**/.parcel-cache/**",
            "**/.next/**",
            "**/.nuxt/**",
            "**/jspm_packages/**",
            "**/bower_components/**",
        ),
        # Python
        (
            "**/__pycache__/**",
            "**/*.pyc",
            "**/*.pyo",
            "**/.pytest_cache/**",
            "**/.mypy_cache/**",
            "**/.ruff_cache/**",
            "**/.tox/**",
            "**/.coverage*",
            "**/coverage.xml",
            "**/.cache/**",
            "**/*.egg-info/**",
            "**/.venv/**",
            "uv.lock",
            "**/venv/**",
            "**/env/**",
            "**/.ipynb_checkpoints/**",
            "poetry.lock",
            "Pipfile.lock",
        ),
        # Go
        ("go.work.sum", "go.sum", "*_minimock.go"),
        # Java / Kotlin / Gradle
        ("**/.gradle/**", "**/build/**", "**/target/**", "**/out/**"),
        # General IDE / project files
        ("**/.idea/**", "**/.vscode/**", ".next/", ".idea/", ".gradle/"),
        # Web frameworks / uploads
        ("static/uploads/*",),
        # Terraform / Serverless
        ("**/.terraform/**", "**/terraform.tfstate*", "**/.serverless/**"),
        # Compiled / binaries / artifacts
        (
            "**/*.class",
            "**/*.o",
            "**/*.a",
            "**/*.so",
            "**/*.dll",
            "**/*.exe",
            "**/*.dylib",
            "**/*.dSYM/**",
            "**/*.log",
        ),
        # Misc build outputs
        ("**/dist/**", "**/.angular/**", "**/.svelte-kit/**"),
        # Migrations (from original)
        ("migrations/*.py", "**/migrations/*.py"),
        # Locks from original (explicit)
        ("package-lock.json",),
    )
    # Flatten in declaration order; the order is part of the contract because
    # later gitignore rules can override earlier ones.
    return [pattern for group in grouped for pattern in group]
def read_gitignore(input_dir: str, extra_patterns: Optional[Sequence[str]] = None) -> pathspec.PathSpec:
    """
    Combine the built-in ignore patterns with the directory's .gitignore.

    Args:
        input_dir: Directory whose top-level ``.gitignore`` is read, if present.
        extra_patterns: Optional additional patterns appended to the built-ins.

    Returns:
        A PathSpec compiled with "gitwildmatch" semantics. Built-in patterns
        come first, so lines from .gitignore are evaluated after them.
    """
    candidates = _default_garbage_patterns()
    if extra_patterns:
        candidates.extend(extra_patterns)

    # dict keys keep insertion order, which gives an order-preserving dedupe.
    patterns = list(dict.fromkeys(candidates))

    ignore_file = os.path.join(input_dir, ".gitignore")
    if os.path.isfile(ignore_file):
        with open(ignore_file, "r", encoding="utf-8", errors="ignore") as handle:
            patterns.extend(handle)

    return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
def should_ignore_path(path, gitignore_spec):
def _parse_extensions(extensions: Optional[Union[str, Iterable[str]]]) -> Optional[List[str]]:
    """
    Normalize an extension filter.

    Accepts a comma/whitespace-separated string or any iterable of such
    strings (list, tuple, set, generator, ...). Each token is lower-cased and
    stripped of a leading "*." glob prefix and leading dots, so "py", ".PY"
    and "*.py" all become "py".

    Returns:
        None            -> accept every file (input was None, empty, or
                           contained a bare "*")
        ["py", "md", …] -> accept files ending in one of these extensions
    """
    if extensions is None:
        return None

    # A bare string is one token stream; any other iterable contributes one
    # stream per element. Checking for str first keeps generators working.
    if isinstance(extensions, str):
        sources = [extensions]
    else:
        sources = [str(item) for item in extensions]

    tokens: List[str] = []
    for source in sources:
        # Commas and whitespace are interchangeable separators.
        tokens.extend(source.replace(",", " ").split())

    if "*" in tokens:
        return None

    normalized: List[str] = []
    for token in tokens:
        ext = token.lower()
        if ext.startswith("*."):
            ext = ext[2:]
        ext = ext.lstrip(".")
        # Tokens such as "." collapse to nothing and must not become a filter.
        if ext:
            normalized.append(ext)
    return normalized or None
def _parse_exclude(exclude: Optional[Union[str, Iterable[str]]]) -> List[str]:
    """
    Normalize exclude patterns into an ordered, de-duplicated list.

    Supports:
    - a string with commas and/or whitespace as separators
    - any iterable of strings (list, tuple, set, generator, ...); every
      element is split on commas/whitespace as well, so CLI input such as
      ``-x "a,b" c`` yields ["a", "b", "c"]

    Patterns can be substrings or globs (fnmatch); they are not interpreted
    here.

    Returns:
        The patterns in first-seen order, without empties or duplicates.
        None or blank input gives an empty list.
    """
    if exclude is None:
        return []

    # Checking for str first lets every other iterable (including generators)
    # take the per-element path instead of crashing on .strip().
    if isinstance(exclude, str):
        sources = [exclude]
    else:
        sources = [str(item) for item in exclude]

    tokens = [
        token
        for source in sources
        for token in source.replace(",", " ").split()
    ]
    # dict keys preserve insertion order, which gives an ordered dedupe.
    return list(dict.fromkeys(tokens))
def _matches_extension(filename: str, exts: Optional[Sequence[str]]) -> bool:
    """
    Tell whether *filename* passes the extension filter.

    ``exts`` of None means "no filter" and always passes. Otherwise the
    lower-cased filename must end with ".<ext>" for at least one entry.
    """
    if exts is None:
        return True
    # str.endswith accepts a tuple of candidates; an empty tuple never matches.
    suffixes = tuple(f".{ext}" for ext in exts)
    return filename.lower().endswith(suffixes)
def structure_directory_content(input_dir, output_file=None, extensions=None):
def _nontext_ratio(chunk: bytes) -> float:
    """
    Return the fraction of bytes in *chunk* that look like control characters.

    Bytes 7, 8, 9, 10, 12, 13 and 27 plus everything in 0x20..0xFF count as
    text-like, so non-ASCII content is not penalized. An empty chunk
    yields 0.0.
    """
    if not chunk:
        return 0.0
    text_like = bytes((7, 8, 9, 10, 12, 13, 27)) + bytes(range(0x20, 0x100))
    # Deleting every text-like byte leaves exactly the suspicious ones.
    leftovers = chunk.translate(None, text_like)
    return len(leftovers) / len(chunk)
def _probe_text_encoding(path: str, blocksize: int = 8192, default: str = "utf-8") -> Tuple[bool, Optional[str]]:
    """
    Heuristically determine if a file is text-like and pick a reasonable encoding.

    Only the first *blocksize* bytes are inspected. Strategy, in order:
      - Unreadable file            -> (False, None).
      - Empty file                 -> text, *default* encoding.
      - BOM for UTF-8/32/16        -> text with that encoding.
      - NUL byte                   -> binary.
      - Valid UTF-8                -> text ("utf-8").
      - Control-char ratio < 0.30  -> text, encoding unknown (None).
      - Otherwise                  -> binary.

    Returns:
        (is_text, encoding). ``encoding`` is None when the file is binary or
        when it looks like text but the encoding could not be determined.
    """
    try:
        with open(path, "rb") as f:
            chunk = f.read(blocksize)
    except OSError:
        # PermissionError, IsADirectoryError and FileNotFoundError are all
        # subclasses of OSError.
        return False, None

    if not chunk:
        return True, default

    # BOMs must be checked before the NUL test: UTF-16/32 text is full of NUL
    # bytes and would otherwise always be rejected as binary.
    if chunk.startswith(codecs.BOM_UTF8):
        return True, "utf-8-sig"
    # UTF-32 goes first because the UTF-32-LE BOM (FF FE 00 00) begins with
    # the UTF-16-LE BOM (FF FE).
    if chunk.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
        return True, "utf-32"
    if chunk.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        return True, "utf-16"

    # Without a BOM, a NUL byte is a strong binary indicator.
    if b"\x00" in chunk:
        return False, None

    # Decode incrementally so a multi-byte character cut off at the block
    # boundary is not mistaken for invalid UTF-8. A short read means the
    # whole file is in hand, so a dangling sequence there is a real error.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoder.decode(chunk, final=len(chunk) < blocksize)
    except UnicodeDecodeError:
        pass
    else:
        return True, "utf-8"

    # Last resort: mostly printable bytes in some unknown 8-bit encoding.
    if _nontext_ratio(chunk) < 0.30:
        return True, None
    return False, None
def _is_excluded(rel_path: str, patterns: Sequence[str]) -> bool:
    """
    Decide whether *rel_path* is hit by any user-supplied exclude pattern.

    A pattern excludes the path when it matches as an fnmatch glob against
    the whole path, or when it simply occurs as a substring of the path.
    An empty or missing pattern list excludes nothing.
    """
    return any(
        fnmatch.fnmatch(rel_path, candidate) or candidate in rel_path
        for candidate in patterns or ()
    )
def structure_directory_content(
    input_dir: str,
    output_file: Optional[str] = None,
    extensions: Optional[Union[str, Iterable[str]]] = None,
    exclude: Optional[Union[str, Iterable[str]]] = None,
    *,
    encoding: str = "utf-8",
    include_binary: bool = False,
    **kwargs,
) -> None:
    """
    Walk input_dir and concatenate **text** file contents into output_file.

    Each included file is written as a "# <relative path>" header line, the
    file contents, and a blank-line separator. Directories and files are
    visited in sorted order.

    Args:
        input_dir: Directory to scan.
        output_file: Path to write collected contents. Defaults to "data.txt"
            or "data.<ext>" if a single explicit extension is provided.
        extensions: Comma/space-separated string, iterable, or "*" for all.
        exclude: Comma/space-separated string or iterable of patterns (substr or glob).
        encoding: Encoding of the output file, and the fallback used to read
            files when detection is inconclusive.
        include_binary: If True, include files even when they look binary (NOT recommended).
        **kwargs: Accepted for forward-compatibility; ignored.
    """
    gitignore_spec = read_gitignore(input_dir)
    exts = _parse_extensions(extensions)
    excl = _parse_exclude(exclude)

    # Decide output filename
    if not output_file:
        output_file = f"data.{exts[0]}" if exts and len(exts) == 1 else "data.txt"
    output_abs = os.path.normcase(os.path.abspath(output_file))

    def _spec_path(full_path: str, is_dir: bool = False) -> str:
        # Ignore patterns are relative to the scanned root, so match on the
        # path relative to input_dir. Matching the joined path would let an
        # ancestor directory named e.g. "build" or "env" ignore everything.
        # Directories get a trailing "/" so patterns such as
        # "**/node_modules/**" or ".idea/" match the directory itself.
        rel = os.path.relpath(full_path, input_dir).replace(os.sep, "/")
        return f"{rel}/" if is_dir else rel

    with open(output_file, "w", encoding=encoding, errors="ignore") as outfile:
        for root, dirs, files in os.walk(input_dir):
            # Prune ignored directories in place so os.walk never descends
            # into them.
            dirs[:] = [
                d for d in sorted(dirs)
                if not gitignore_spec.match_file(_spec_path(os.path.join(root, d), is_dir=True))
            ]

            for file in sorted(files):
                if not _matches_extension(file, exts):
                    continue

                file_path = os.path.join(root, file)

                # Never read the dump back into itself when the output file
                # lives inside the scanned tree.
                if os.path.normcase(os.path.abspath(file_path)) == output_abs:
                    continue

                # Filter files via .gitignore + defaults
                if gitignore_spec.match_file(_spec_path(file_path)):
                    continue

                relative_path = os.path.relpath(file_path, input_dir)
                if _is_excluded(relative_path, excl):
                    continue

                is_text, detected_enc = _probe_text_encoding(file_path)
                if not is_text and not include_binary:
                    continue

                enc_to_use = detected_enc or encoding
                try:
                    with open(file_path, "r", encoding=enc_to_use, errors="ignore") as infile:
                        data = infile.read()
                except (OSError, UnicodeError):
                    # Best effort: unreadable files are skipped, not fatal.
                    continue

                outfile.write(f"# {relative_path}\n")
                outfile.write(data)
                outfile.write("\n\n")
def _build_arg_parser() -> argparse.ArgumentParser:
    """
    Create the command-line parser for the dump script.

    Defines one optional positional (``input_dir``) and the options
    ``-o/--output-file``, ``-e/--extensions``, ``-x/--exclude`` (zero or more
    values), ``--encoding`` and ``--include-binary``.
    """
    parser = argparse.ArgumentParser(
        description="Concatenate repository TEXT files into a single text file (respects .gitignore and sensible defaults)."
    )
    parser.add_argument(
        "input_dir",
        nargs="?",
        default=".",
        help="Directory to scan (default: current directory).",
    )

    # (flags, add_argument keyword arguments), registered in this order.
    option_table = [
        (
            ("-o", "--output-file"),
            {
                "dest": "output_file",
                "default": None,
                "help": "Output file path (default: data.txt or data.<ext> if single extension given).",
            },
        ),
        (
            ("-e", "--extensions"),
            {
                "dest": "extensions",
                "default": None,
                "help": 'Comma/space-separated list of extensions (e.g., "py,md"). Use "*" for all.',
            },
        ),
        (
            ("-x", "--exclude"),
            {
                "dest": "exclude",
                "nargs": "*",
                "default": None,
                "help": "Space-separated exclude patterns (substr or glob). Commas within args are also supported.",
            },
        ),
        (
            ("--encoding",),
            {
                "dest": "encoding",
                "default": "utf-8",
                "help": "Default encoding used to read files (fallback when detection is inconclusive).",
            },
        ),
        (
            ("--include-binary",),
            {
                "dest": "include_binary",
                "action": "store_true",
                "help": "Include files even if they appear binary (NOT recommended).",
            },
        ),
    ]
    for flags, options in option_table:
        parser.add_argument(*flags, **options)
    return parser
def _from_cli(argv: Sequence[str]) -> None:
    """
    Run the tool from a command line.

    Args:
        argv: Full argument vector including the program name (``sys.argv``).
            When it holds only the program name, the legacy interactive mode
            prompts for every setting; otherwise the arguments are parsed
            with argparse.
    """
    # Backwards-compatible interactive mode if no CLI args
    if len(argv) == 1:
        input_directory = input("directory path: ").strip() or "."
        output_filename = input("output file name (optional): ").strip()
        file_extensions = input('file extensions separated by commas or spaces (optional, "*" for all): ').strip()
        exclude_patterns_input = input("exclude patterns separated by spaces or commas (optional): ").strip()
        structure_directory_content(
            input_dir=input_directory,
            output_file=output_filename or None,
            extensions=file_extensions or None,
            exclude=exclude_patterns_input or None,
        )
        return

    args = _build_arg_parser().parse_args(argv[1:])

    # Join the -x tokens back into one string so that commas inside any token
    # ("-x a,b c") are split exactly like whitespace-separated tokens. A bare
    # "-x" gives an empty string, which means "no excludes".
    exclude_arg: Optional[str] = None if args.exclude is None else " ".join(args.exclude)

    structure_directory_content(
        input_dir=args.input_dir,
        output_file=args.output_file,
        extensions=args.extensions,
        exclude=exclude_arg,
        encoding=args.encoding,
        include_binary=args.include_binary,
    )
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) == 1: _from_cli(sys.argv)
input_directory = input("directory path: ")
output_filename = input("output file name (optional): ")
file_extensions = input("file extensions separated by commas (optional): ")
structure_directory_content(
input_directory,
output_filename if output_filename else None,
file_extensions if file_extensions else None,
)
else:
input_directory = sys.argv[1] if len(sys.argv) > 1 else "."
output_filename = sys.argv[2] if len(sys.argv) > 2 else None
file_extensions = sys.argv[3] if len(sys.argv) > 3 else None
structure_directory_content(input_directory, output_filename, file_extensions)