Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-10 15:14:56 +03:00)

commit fd7e299967 (parent e84b295279)
walk directories
@@ -404,7 +404,10 @@ def git_checkout(
             if not is_subpath_of(tmp_dir, source_path):
                 err = f"'{subpath}' is a path outside of the cloned repository."
                 msg.fail(err, repo, exits=1)
-            shutil.copytree(str(source_path), str(dest))
+            if source_path.is_dir():
+                shutil.copytree(str(source_path), str(dest))
+            else:
+                shutil.copy(str(source_path), str(dest))
         except FileNotFoundError:
             err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
             msg.fail(err, repo, exits=1)
@@ -573,3 +576,26 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
         local_msg.info("Using CPU")
         if gpu_is_available():
             local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
+
+
+def walk_directory(path: Path, suffix: str) -> List[Path]:
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        elif not path.parts[-1].endswith(suffix):
+            continue
+        else:
+            locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
+    return locs
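For context, the new walk_directory helper iteratively collects files under a path, skipping dotfiles, deduplicating repeated entries, and returning a sorted list. A minimal usage sketch (the corpus layout and import path are assumptions for illustration, not part of this commit):

    from pathlib import Path
    from spacy.cli._util import walk_directory

    # Gather every .txt file under corpus/, including files in nested
    # subdirectories; entries whose name starts with "." are skipped.
    txt_files = walk_directory(Path("corpus"), ".txt")
    for p in txt_files:
        print(p)

Note the traversal is iterative rather than recursive: directories found along the way are appended to the work list and expanded via iterdir(), so deeply nested trees cannot hit Python's recursion limit.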
@@ -1,14 +1,15 @@
 import tqdm
-import sys
+from itertools import chain
 
-from ._util import app, Arg, Opt, setup_gpu, import_code
-from typing import Optional, Generator, Union
 from pathlib import Path
+from typing import Optional, Generator, Union
 
 from wasabi import msg
 
+from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
+
 from ..tokens import Doc, DocBin
 from ..vocab import Vocab
-from .. import util
+from ..util import ensure_path, load_model
 
 
 path_help = ("Location of the documents to predict on."
@@ -23,52 +24,46 @@ code_help = ("Path to Python file with additional "
 gold_help = "Use gold preprocessing provided in the .spacy files"
 
 
-def _stream_data(
-    data_path: Path,
-    vocab: Vocab,
-    suffix: Optional[str] = None
-) -> Generator[Union[str, Doc], None, None]:
+def _stream_file(path: Path, vocab: Vocab) -> Generator[Union[Doc, str], None, None]:
     """
-    Load data which is either in a single file
-    in .spacy or plain text format or multiple
-    text files in a directory. If a directory
-    is provided skip subdirectories and undecodeable
-    files.
+    Stream data from a single file. If the path points to
+    a .spacy file then yield from the DocBin otherwise
+    yield each line of a text file. If a decoding error
+    is encountered during reading the file exit.
     """
-    if not data_path.is_dir():
+    if not path.is_dir():
         # Yield from DocBin.
-        if data_path.suffix == ".spacy":
-            docbin = DocBin().from_disk(data_path)
+        if path.suffix == ".spacy":
+            docbin = DocBin().from_disk(path)
             for doc in docbin.get_docs(vocab):
                 yield doc
         # Yield from text file
         else:
             try:
-                with open(data_path, 'r') as fin:
+                with open(path, 'r') as fin:
                     for line in fin:
                         yield line
             except UnicodeDecodeError as e:
                 print(e)
                 msg.warn(
-                    f"{data_path} could not be decoded.",
+                    f"{path} could not be decoded.",
                     exits=True
                 )
-    else:
-        # Yield per one file in directory
-        for path in data_path.iterdir():
-            if path.is_dir():
-                msg.warn(f"Skipping directory {path}")
-            elif suffix is not None and path.suffix != suffix:
-                print(suffix, path.suffix)
-                msg.warn(f"Skipping file {path}")
-            else:
-                with open(path, 'r') as fin:
-                    try:
-                        text = fin.read()
-                        yield text
-                    except UnicodeDecodeError as e:
-                        msg.warn(f"Skipping file {path}")
-                        print(e)
+
+
+def _maybe_read(path: Path) -> Union[str, None]:
+    """
+    Try to read the text file from the provided path.
+    When encountering a decoding error just warn and pass.
+    """
+    with open(path, 'r') as fin:
+        try:
+            text = fin.read()
+            return text
+        except UnicodeDecodeError as e:
+            msg.warn(f"Skipping file {path}")
+            print(e)
+    return None
 
 
 @app.command("apply")
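Taken together, _stream_file yields Doc objects when given a .spacy file and one str per line for a plain text file, while _maybe_read returns a file's full text or None on a decoding error. A rough behavioral sketch, assuming the two helpers above and hypothetical input files:

    import spacy
    from pathlib import Path
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    # A .spacy path yields Doc objects out of the stored DocBin ...
    for item in _stream_file(Path("data.spacy"), nlp.vocab):
        assert isinstance(item, Doc)
    # ... while a plain text path yields one str per line.
    for item in _stream_file(Path("data.txt"), nlp.vocab):
        assert isinstance(item, str)
    # _maybe_read returns None instead of raising on undecodable input,
    # which lets callers drop failures with filter(None, ...).
    maybe_text = _maybe_read(Path("notes.txt"))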
@@ -78,20 +73,27 @@ def apply_cli(
     data_path: Path = Arg(..., help=path_help, exists=True),
     output: Path = Arg(..., help=out_help, dir_okay=False),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
-    use_gpu: Optional[int] = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
-    batch_size: Optional[int] = Opt(1, "--batch-size", "-b", help="Batch size."),
-    n_process: Optional[int] = Opt(1, "--n-process", "-n", help="number of processors to use."),
-    suffix: Optional[str] = Opt(None, "--suffix", "-n", help="Only read files with file.suffix.")
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
+    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
+    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use."),
+    suffix: str = Opt("", "--suffix", "-n", help="Only read files with file.suffix.")
 ):
     """
     Apply a trained pipeline to documents to get predictions.
     Expects a loadable spaCy pipeline and some data as input.
-    The input can be provided multiple formats. It can be a .spacy
-    file, a single text file with one document per line or a directory
-    where each file is assumed to be plain text document.
+    The data can be provided in multiple formats. It can be a single
+    .spacy file or a single text file with one document per line.
+    A directory can also be provided in which case the 'suffix'
+    argument is required. All paths pointing to files with the
+    provided suffix will be recursively collected and processed.
 
     DOCS: https://spacy.io/api/cli#tba
     """
+    if data_path.is_dir() and suffix == "":
+        raise ValueError(
+            "When the provided 'data_path' is a directory "
+            "the --suffix argument has to be provided as well."
+        )
     if suffix is not None:
         if not suffix.startswith("."):
             suffix = "." + suffix
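With these options, a directory run would be invoked along the lines of spacy apply <model> <data_path> <output> --suffix .txt. The sketch below instead calls the underlying apply() helper directly, bypassing the CLI layer; parameter names are taken from the def apply( hunk that follows, and the model name and paths are hypothetical:

    from pathlib import Path

    # Direct call, so the suffix must already carry its leading dot;
    # apply_cli normally prepends it when missing.
    apply(
        data_path=Path("corpus"),
        output=Path("predictions.spacy"),
        model="en_core_web_sm",
        batch_size=16,
        n_process=1,
        suffix=".txt",
    )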
@@ -106,17 +108,29 @@ def apply(
     model: str,
     batch_size: int,
     n_process: int,
-    suffix: Optional[str]
+    suffix: str
 ):
-    data_path = util.ensure_path(data_path)
-    output_path = util.ensure_path(output)
+    data_path = ensure_path(data_path)
+    output_path = ensure_path(output)
     if not data_path.exists():
         msg.fail("Couldn't find data path.", data_path, exits=1)
-    nlp = util.load_model(model)
+    nlp = load_model(model)
     msg.good(f"Loaded model {model}")
     vocab = nlp.vocab
     docbin = DocBin()
-    datagen = _stream_data(data_path, vocab, suffix)
+    datagen: Union[
+        Generator[Union[Doc, str], None, None],
+        chain[Union[Doc, str]],
+        filter[str]
+    ]
+    if not data_path.is_dir():
+        datagen = _stream_file(data_path, vocab)
+    else:
+        paths = walk_directory(data_path, suffix)
+        if suffix == ".spacy":
+            datagen = chain(*[_stream_file(path, vocab) for path in paths])
+        else:
+            datagen = filter(None, (_maybe_read(path) for path in paths))
     for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
         docbin.add(doc)
     if output_path.is_dir():
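One property of the rewritten dispatch worth noting: the generator from _stream_file, chain, and filter are all lazy, so nlp.pipe pulls texts on demand instead of materializing the corpus up front. A small sketch of that pattern, reusing names from the diff above with hypothetical paths:

    from pathlib import Path

    paths = [Path("a.txt"), Path("b.txt"), Path("c.txt")]
    # Nothing is read at this point: the generator expression is lazy,
    # and filter(None, ...) drops the None results from undecodable files.
    texts = filter(None, (_maybe_read(p) for p in paths))
    for doc in nlp.pipe(texts, batch_size=2):
        ...  # files are opened and read only as batches are consumed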
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer
@@ -7,7 +7,7 @@ import re
 import sys
 import itertools
 
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
     return None
 
 
-def walk_directory(path: Path, converter: str) -> List[Path]:
-    if not path.is_dir():
-        return [path]
-    paths = [path]
-    locs = []
-    seen = set()
-    for path in paths:
-        if str(path) in seen:
-            continue
-        seen.add(str(path))
-        if path.parts[-1].startswith("."):
-            continue
-        elif path.is_dir():
-            paths.extend(path.iterdir())
-        elif converter == "json" and not path.parts[-1].endswith("json"):
-            continue
-        elif converter == "conll" and not path.parts[-1].endswith("conll"):
-            continue
-        elif converter == "iob" and not path.parts[-1].endswith("iob"):
-            continue
-        else:
-            locs.append(path)
-    # It's good to sort these, in case the ordering messes up cache.
-    locs.sort()
-    return locs
-
-
 def verify_cli_args(
     msg: Printer,
     input_path: Path,