updated extract_folder script

This commit is contained in:
Alexander Karpov 2025-11-28 16:04:30 +03:00
parent 06cd5b7bea
commit 61dd049637

View File

@ -1,98 +1,473 @@
#!/usr/bin/env python3
"""
Dump selected repository files into a single text file, honoring .gitignore,
sensible default ignore patterns, and extracting **only text-readable files**
(no binary blobs).
What's new:
- Much larger default ignore list (build artifacts, caches, lockfiles, etc.).
- Accept kwargs (safe to pass extra keyword arguments).
- Extensions accept "*" to mean "all".
- Excludes can be provided as a space-separated list (CLI) or any of: string with
commas/whitespace, list/tuple; glob patterns supported (fnmatch).
- **Binary detection**: only text-like files are included (BOM/UTF-8 checks + heuristics).
You can override with `--include-binary` if ever needed.
Examples:
# Everything (respecting gitignore + defaults), write to data.txt
python script.py .
# Only py,md; space-separated excludes; output defaults to "data.txt" (more than one extension)
python script.py . -e py,md -x migrations tests docs
# All extensions ("*"), but still allow manual exclude patterns
python script.py . -e "*" -x "node_modules dist build"
# With keyword args from another Python module
from script import structure_directory_content
structure_directory_content(
input_dir=".",
extensions="py,go",
exclude=["migrations", "node_modules"],
output_file="repo_dump.txt",
)
"""
import argparse
import codecs
import fnmatch
import os
import sys
from typing import Iterable, List, Optional, Sequence, Tuple, Union
import pathspec
def read_gitignore(input_dir):
def _default_garbage_patterns() -> List[str]:
    """
    Return the built-in ignore patterns, written in gitwildmatch syntax.

    The patterns are meant to be combined with whatever the repository's own
    .gitignore declares, not to replace it. A new list is built on every
    call, so the caller may extend or otherwise mutate the result.
    """
    vcs = ("**/.git/**", "**/.hg/**", "**/.svn/**")
    os_cruft = ("**/.DS_Store", "**/Thumbs.db")
    node_js = (
        "**/node_modules/**",
        "**/package-lock.json",
        "**/yarn.lock",
        "**/pnpm-lock.yaml",
        "**/bun.lockb",
        "**/.yarn/**",
        "**/.pnpm-store/**",
        "**/.parcel-cache/**",
        "**/.next/**",
        "**/.nuxt/**",
        "**/jspm_packages/**",
        "**/bower_components/**",
    )
    python = (
        "**/__pycache__/**",
        "**/*.pyc",
        "**/*.pyo",
        "**/.pytest_cache/**",
        "**/.mypy_cache/**",
        "**/.ruff_cache/**",
        "**/.tox/**",
        "**/.coverage*",
        "**/coverage.xml",
        "**/.cache/**",
        "**/*.egg-info/**",
        "**/.venv/**",
        "uv.lock",
        "**/venv/**",
        "**/env/**",
        "**/.ipynb_checkpoints/**",
        "poetry.lock",
        "Pipfile.lock",
    )
    go = ("go.work.sum", "go.sum", "*_minimock.go")
    jvm = ("**/.gradle/**", "**/build/**", "**/target/**", "**/out/**")
    ide = ("**/.idea/**", "**/.vscode/**", ".next/", ".idea/", ".gradle/")
    uploads = ("static/uploads/*",)
    infra = ("**/.terraform/**", "**/terraform.tfstate*", "**/.serverless/**")
    artifacts = (
        "**/*.class",
        "**/*.o",
        "**/*.a",
        "**/*.so",
        "**/*.dll",
        "**/*.exe",
        "**/*.dylib",
        "**/*.dSYM/**",
        "**/*.log",
    )
    build_outputs = ("**/dist/**", "**/.angular/**", "**/.svelte-kit/**")
    # Carried over from the earlier, much shorter default list.
    legacy = ("migrations/*.py", "**/migrations/*.py", "package-lock.json")

    groups = (
        vcs,
        os_cruft,
        node_js,
        python,
        go,
        jvm,
        ide,
        uploads,
        infra,
        artifacts,
        build_outputs,
        legacy,
    )
    return [pattern for group in groups for pattern in group]
def read_gitignore(input_dir: str, extra_patterns: Optional[Sequence[str]] = None) -> pathspec.PathSpec:
    """
    Build a PathSpec from the default garbage patterns, optional extra
    patterns, and the .gitignore located directly in ``input_dir``.

    Args:
        input_dir: Directory whose top-level .gitignore is read (if present).
        extra_patterns: Additional gitwildmatch patterns appended after the
            defaults and before the .gitignore lines.

    Returns:
        A ``pathspec.PathSpec`` compiled with "gitwildmatch" semantics.
    """
    patterns = _default_garbage_patterns()
    if extra_patterns:
        patterns.extend(extra_patterns)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    patterns = list(dict.fromkeys(patterns))

    gitignore_path = os.path.join(input_dir, ".gitignore")
    if os.path.isfile(gitignore_path):
        # "utf-8-sig" drops a leading BOM; read as plain "utf-8" the BOM
        # would be glued onto the first pattern and that rule would never match.
        with open(gitignore_path, "r", encoding="utf-8-sig", errors="ignore") as gitignore_file:
            patterns.extend(gitignore_file)
    return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
def should_ignore_path(path, gitignore_spec):
def _parse_extensions(extensions: Optional[Union[str, Iterable[str]]]) -> Optional[List[str]]:
    """
    Normalize an extension filter.

    Accepts a string or any iterable of strings; every piece may itself hold
    several extensions separated by commas and/or whitespace. Leading dots
    are dropped, names are lower-cased, and duplicates are removed while
    keeping first-seen order.

    Returns:
        None              -> accept all files (input was None, empty, or contained "*")
        ["py", "md", ...] -> accept files ending in one of these extensions
    """
    if extensions is None:
        return None

    # A bare string is a single chunk; anything else (list, tuple, set,
    # frozenset, generator, ...) is consumed as an iterable of chunks.
    if isinstance(extensions, str):
        chunks = [extensions]
    else:
        chunks = [str(item) for item in extensions]

    normalized: List[str] = []
    for chunk in chunks:
        for token in chunk.replace(",", " ").split():
            if token == "*":
                # Wildcard anywhere means "no filtering".
                return None
            ext = token.lower().lstrip(".")
            if ext and ext not in normalized:
                normalized.append(ext)
    return normalized or None
def _parse_exclude(exclude: Optional[Union[str, Iterable[str]]]) -> List[str]:
    """
    Normalize exclude patterns into a de-duplicated list (order preserved).

    Supports:
    - a string: split on commas and/or whitespace
    - any iterable of items (list, tuple, set, generator, ...): each item is
      converted with ``str`` and stripped, but NOT split, so an item may
      contain spaces or commas

    Patterns can be substrings or globs (fnmatch). Empty entries are dropped.
    """
    if exclude is None:
        return []
    if isinstance(exclude, str):
        candidates = exclude.replace(",", " ").split()
    else:
        candidates = [str(item).strip() for item in exclude]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return [pattern for pattern in dict.fromkeys(candidates) if pattern]
def _matches_extension(filename: str, exts: Optional[Sequence[str]]) -> bool:
    """
    Tell whether ``filename`` passes the extension filter.

    ``None`` means "no filter". Otherwise the lower-cased filename must end
    with ".<ext>" for at least one entry of ``exts`` (entries are compared
    as given).
    """
    if exts is None:
        return True
    wanted_suffixes = tuple(f".{ext}" for ext in exts)
    return filename.lower().endswith(wanted_suffixes)
def structure_directory_content(input_dir, output_file=None, extensions=None):
def _nontext_ratio(chunk: bytes) -> float:
    """
    Return the fraction of bytes in ``chunk`` that do not look like text.

    Bytes counted as text-like: 7, 8, 9, 10, 12, 13, 27 and the whole range
    0x20..0xFF (so non-ASCII encodings are not penalized). Every other byte
    counts as non-text. An empty chunk yields 0.0.
    """
    if len(chunk) == 0:
        return 0.0
    text_like = bytes((7, 8, 9, 10, 12, 13, 27)) + bytes(range(0x20, 0x100))
    # Deleting every text-like byte leaves exactly the non-text ones.
    leftovers = chunk.translate(None, text_like)
    return len(leftovers) / len(chunk)
def _probe_text_encoding(path: str, blocksize: int = 8192, default: str = "utf-8") -> Tuple[bool, Optional[str]]:
    """
    Heuristically determine if a file is text-like and pick a reasonable encoding.

    Only the first ``blocksize`` bytes are inspected. Strategy, in order:
    - Unreadable file -> (False, None).
    - Empty file -> text, ``default`` encoding.
    - BOM for UTF-8/32/16 -> text with that encoding. This runs before the
      NUL test because UTF-16/32 text legitimately contains NUL bytes.
    - NUL byte -> binary.
    - Decodes as UTF-8 -> text ("utf-8"). A multi-byte character cut off by
      the probe boundary is tolerated.
    - Fallback heuristic: control-char ratio < 0.30 -> text (encoding unknown).

    Returns:
        (is_text, encoding) where encoding is None when it could not be determined.
    """
    try:
        with open(path, "rb") as f:
            chunk = f.read(blocksize)
    except OSError:
        # Covers PermissionError, IsADirectoryError, FileNotFoundError, ...
        return False, None

    if not chunk:
        return True, default

    # BOMs. UTF-32 must be tested before UTF-16: the UTF-32-LE BOM
    # (FF FE 00 00) starts with the UTF-16-LE BOM (FF FE).
    if chunk.startswith(codecs.BOM_UTF8):
        return True, "utf-8-sig"
    if chunk.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
        return True, "utf-32"
    if chunk.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        return True, "utf-16"

    # Quick binary check (only meaningful once BOM-marked files are handled)
    if b"\x00" in chunk:
        return False, None

    # Try UTF-8. When the chunk filled the whole probe window the file may
    # continue, so an incomplete trailing sequence must not count as an error.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoder.decode(chunk, final=len(chunk) < blocksize)
    except UnicodeDecodeError:
        pass
    else:
        return True, "utf-8"

    # Heuristic on control chars
    if _nontext_ratio(chunk) < 0.30:
        return True, None
    return False, None
def _is_excluded(rel_path: str, patterns: Sequence[str]) -> bool:
    """
    Decide whether ``rel_path`` is hit by any user-supplied exclude pattern.

    A pattern hits when it matches the whole path as an fnmatch glob, or
    when it simply occurs somewhere inside the path as a substring.
    No patterns means nothing is excluded.
    """
    if not patterns:
        return False
    return any(
        fnmatch.fnmatch(rel_path, candidate) or candidate in rel_path
        for candidate in patterns
    )
def structure_directory_content(
    input_dir: str,
    output_file: Optional[str] = None,
    extensions: Optional[Union[str, Iterable[str]]] = None,
    exclude: Optional[Union[str, Iterable[str]]] = None,
    *,
    encoding: str = "utf-8",
    include_binary: bool = False,
    **kwargs,
) -> None:
    """
    Walk input_dir, concatenate **text** file contents into output_file.

    Each included file is written as a "# <relative path>" header line, the
    file's contents, and a blank separator. Directories and files are
    visited in sorted order.

    Args:
        input_dir: Directory to scan.
        output_file: Path to write collected contents. Defaults to "data.txt"
            or "data.<ext>" if a single explicit extension is provided.
        extensions: Comma/space-separated string, iterable, or "*" for all.
        exclude: Comma/space-separated string or iterable of patterns (substr or glob).
        encoding: Encoding of the output file, and the fallback used to read
            input files when detection is inconclusive.
        include_binary: If True, include files even when they look binary (NOT recommended).
        **kwargs: Accepted for forward-compatibility; ignored if unknown.
    """
    gitignore_spec = read_gitignore(input_dir)
    exts = _parse_extensions(extensions)
    excl = _parse_exclude(exclude)

    # Decide output filename
    if not output_file:
        output_file = f"data.{exts[0]}" if exts and len(exts) == 1 else "data.txt"
    # Resolved once so the walk can recognise the dump itself and skip it
    # when the output lives inside the scanned tree.
    output_real = os.path.realpath(output_file)

    with open(output_file, "w", encoding=encoding, errors="ignore") as outfile:
        for root, dirs, files in os.walk(input_dir):
            # Ignore rules are relative to the scanned root, so always match
            # on paths relative to input_dir; matching the raw walk path would
            # let components of input_dir itself (e.g. ".../build/proj")
            # trigger the rules. The trailing "/" marks the path as a
            # directory so directory-only patterns ("foo/") can prune it.
            dirs[:] = [
                d for d in sorted(dirs)
                if not gitignore_spec.match_file(
                    os.path.relpath(os.path.join(root, d), input_dir) + "/"
                )
            ]

            for file in sorted(files):
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, input_dir)

                # .gitignore + default patterns
                if gitignore_spec.match_file(relative_path):
                    continue
                if not _matches_extension(file, exts):
                    continue
                if _is_excluded(relative_path, excl):
                    continue
                # Never feed the dump into itself.
                if os.path.realpath(file_path) == output_real:
                    continue

                is_text, detected_enc = _probe_text_encoding(file_path)
                if not is_text and not include_binary:
                    continue

                enc_to_use = detected_enc or encoding
                try:
                    with open(file_path, "r", encoding=enc_to_use, errors="ignore") as infile:
                        data = infile.read()
                except (UnicodeDecodeError, OSError):
                    # Best effort: unreadable files are skipped, not fatal.
                    continue

                outfile.write(f"# {relative_path}\n")
                outfile.write(data)
                outfile.write("\n\n")
def _build_arg_parser() -> argparse.ArgumentParser:
    """
    Create the command-line parser.

    Arguments are registered from a declarative table, in this order: the
    optional positional ``input_dir``, then -o/--output-file,
    -e/--extensions, -x/--exclude (zero or more values), --encoding and the
    --include-binary switch.
    """
    parser = argparse.ArgumentParser(
        description="Concatenate repository TEXT files into a single text file (respects .gitignore and sensible defaults)."
    )
    argument_table = (
        (
            ("input_dir",),
            {
                "nargs": "?",
                "default": ".",
                "help": "Directory to scan (default: current directory).",
            },
        ),
        (
            ("-o", "--output-file"),
            {
                "dest": "output_file",
                "default": None,
                "help": "Output file path (default: data.txt or data.<ext> if single extension given).",
            },
        ),
        (
            ("-e", "--extensions"),
            {
                "dest": "extensions",
                "default": None,
                "help": 'Comma/space-separated list of extensions (e.g., "py,md"). Use "*" for all.',
            },
        ),
        (
            ("-x", "--exclude"),
            {
                "dest": "exclude",
                "nargs": "*",
                "default": None,
                "help": "Space-separated exclude patterns (substr or glob). Commas within args are also supported.",
            },
        ),
        (
            ("--encoding",),
            {
                "dest": "encoding",
                "default": "utf-8",
                "help": "Default encoding used to read files (fallback when detection is inconclusive).",
            },
        ),
        (
            ("--include-binary",),
            {
                "dest": "include_binary",
                "action": "store_true",
                "help": "Include files even if they appear binary (NOT recommended).",
            },
        ),
    )
    for flags, settings in argument_table:
        parser.add_argument(*flags, **settings)
    return parser
def _from_cli(argv: Sequence[str]) -> None:
    """
    Entry point for command-line use.

    With no arguments besides the program name, falls back to the
    backwards-compatible interactive mode and prompts for every setting.
    Otherwise the arguments are parsed with the argparse parser and passed
    on to ``structure_directory_content``.

    Args:
        argv: Full argument vector, including the program name at index 0.
    """
    # Backwards-compatible interactive mode if no CLI args
    if len(argv) == 1:
        input_directory = input("directory path: ").strip() or "."
        output_filename = input("output file name (optional): ").strip()
        file_extensions = input('file extensions separated by commas or spaces (optional, "*" for all): ').strip()
        exclude_patterns_input = input("exclude patterns separated by spaces or commas (optional): ").strip()
        structure_directory_content(
            input_dir=input_directory,
            output_file=output_filename if output_filename else None,
            extensions=file_extensions if file_extensions else None,
            exclude=exclude_patterns_input if exclude_patterns_input else None,
        )
        return

    args = _build_arg_parser().parse_args(argv[1:])

    # argparse returns one list entry per shell token. Join them back into a
    # single string so every token goes through the same comma/whitespace
    # splitting; passed as a list, "-x a,b c" would keep "a,b" as one pattern.
    exclude_arg = " ".join(args.exclude) if args.exclude else None

    structure_directory_content(
        input_dir=args.input_dir,
        output_file=args.output_file,
        extensions=args.extensions,
        exclude=exclude_arg,
        encoding=args.encoding,
        include_binary=args.include_binary,
    )
if __name__ == "__main__":
    # Interactive prompts and flag parsing both live in _from_cli; the guard
    # only hands over the raw argument vector.
    _from_cli(sys.argv)