mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00
Merge branch 'develop' into refactor/vector-names
This commit is contained in:
commit
191fb4144f
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a11"
|
__version__ = "3.0.0a12"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
|
|
|
@ -195,7 +195,7 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
||||||
dir_checksum.update(sub_file.read_bytes())
|
dir_checksum.update(sub_file.read_bytes())
|
||||||
return dir_checksum.hexdigest()
|
return dir_checksum.hexdigest()
|
||||||
raise ValueError(f"Can't get checksum for {path}: not a file or directory")
|
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
|
@ -320,23 +320,43 @@ def git_sparse_checkout(
|
||||||
repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
|
repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
|
||||||
):
|
):
|
||||||
if dest.exists():
|
if dest.exists():
|
||||||
raise IOError("Destination of checkout must not exist")
|
msg.fail("Destination of checkout must not exist", exits=1)
|
||||||
if not dest.parent.exists():
|
if not dest.parent.exists():
|
||||||
raise IOError("Parent of destination of checkout must exist")
|
raise IOError("Parent of destination of checkout must exist")
|
||||||
|
# We're using Git, partial clone and sparse checkout to
|
||||||
|
# only clone the files we need
|
||||||
|
# This ends up being RIDICULOUS. omg.
|
||||||
|
# So, every tutorial and SO post talks about 'sparse checkout'...But they
|
||||||
|
# go and *clone* the whole repo. Worthless. And cloning part of a repo
|
||||||
|
# turns out to be completely broken. The only way to specify a "path" is..
|
||||||
|
# a path *on the server*? The contents of which, specifies the paths. Wat.
|
||||||
|
# Obviously this is hopelessly broken and insecure, because you can query
|
||||||
|
# arbitrary paths on the server! So nobody enables this.
|
||||||
|
# What we have to do is disable *all* files. We could then just checkout
|
||||||
|
# the path, and it'd "work", but be hopelessly slow...Because it goes and
|
||||||
|
# transfers every missing object one-by-one. So the final piece is that we
|
||||||
|
# need to use some weird git internals to fetch the missings in bulk, and
|
||||||
|
# *that* we can do by path.
|
||||||
# We're using Git and sparse checkout to only clone the files we need
|
# We're using Git and sparse checkout to only clone the files we need
|
||||||
with make_tempdir() as tmp_dir:
|
with make_tempdir() as tmp_dir:
|
||||||
|
# This is the "clone, but don't download anything" part.
|
||||||
cmd = (
|
cmd = (
|
||||||
f"git clone {repo} {tmp_dir} --no-checkout "
|
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||||
"--depth 1 --config core.sparseCheckout=true"
|
"--filter=blob:none" # <-- The key bit
|
||||||
)
|
)
|
||||||
if branch is not None:
|
if branch is not None:
|
||||||
cmd = f"{cmd} -b {branch}"
|
cmd = f"{cmd} -b {branch}"
|
||||||
run_command(cmd)
|
run_command(cmd, capture=True)
|
||||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
# Now we need to find the missing filenames for the subpath we want.
|
||||||
f.write(subpath)
|
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
ret = run_command(cmd, capture=True)
|
||||||
|
missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||||
|
# Now pass those missings into another bit of git internals
|
||||||
|
run_command(
|
||||||
|
f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
|
||||||
|
)
|
||||||
|
# And finally, we can checkout our subpath
|
||||||
|
run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
|
||||||
# We need Path(name) to make sure we also support subdirectories
|
# We need Path(name) to make sure we also support subdirectories
|
||||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||||
print(dest)
|
|
||||||
print(list(dest.iterdir()))
|
|
||||||
|
|
|
@ -77,7 +77,9 @@ def package(
|
||||||
meta = generate_meta(meta, msg)
|
meta = generate_meta(meta, msg)
|
||||||
errors = validate(ModelMetaSchema, meta)
|
errors = validate(ModelMetaSchema, meta)
|
||||||
if errors:
|
if errors:
|
||||||
msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
|
msg.fail("Invalid model meta.json")
|
||||||
|
print("\n".join(errors))
|
||||||
|
sys.exit(1)
|
||||||
model_name = meta["lang"] + "_" + meta["name"]
|
model_name = meta["lang"] + "_" + meta["name"]
|
||||||
model_name_v = model_name + "-" + meta["version"]
|
model_name_v = model_name + "-" + meta["version"]
|
||||||
main_path = output_dir / model_name_v
|
main_path = output_dir / model_name_v
|
||||||
|
@ -118,10 +120,10 @@ def get_meta(
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
"name": "model",
|
"name": "model",
|
||||||
"version": "0.0.0",
|
"version": "0.0.0",
|
||||||
"description": None,
|
"description": "",
|
||||||
"author": None,
|
"author": "",
|
||||||
"email": None,
|
"email": "",
|
||||||
"url": None,
|
"url": "",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
}
|
}
|
||||||
meta.update(existing_meta)
|
meta.update(existing_meta)
|
||||||
|
|
|
@ -14,7 +14,7 @@ def project_pull_cli(
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Retrieve any precomputed outputs from a remote storage that are available.
|
"""Retrieve available precomputed outputs from a remote storage.
|
||||||
You can alias remotes in your project.yml by mapping them to storage paths.
|
You can alias remotes in your project.yml by mapping them to storage paths.
|
||||||
A storage can be anything that the smart-open library can upload to, e.g.
|
A storage can be anything that the smart-open library can upload to, e.g.
|
||||||
gcs, aws, ssh, local directories etc
|
gcs, aws, ssh, local directories etc
|
||||||
|
|
|
@ -63,7 +63,7 @@ class TokenPatternString(BaseModel):
|
||||||
class Config:
|
class Config:
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
|
||||||
@validator("*", pre=True, each_item=True)
|
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
||||||
def raise_for_none(cls, v):
|
def raise_for_none(cls, v):
|
||||||
if v is None:
|
if v is None:
|
||||||
raise ValueError("None / null is not allowed")
|
raise ValueError("None / null is not allowed")
|
||||||
|
@ -84,7 +84,7 @@ class TokenPatternNumber(BaseModel):
|
||||||
class Config:
|
class Config:
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
|
||||||
@validator("*", pre=True, each_item=True)
|
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
||||||
def raise_for_none(cls, v):
|
def raise_for_none(cls, v):
|
||||||
if v is None:
|
if v is None:
|
||||||
raise ValueError("None / null is not allowed")
|
raise ValueError("None / null is not allowed")
|
||||||
|
@ -145,7 +145,7 @@ class TokenPattern(BaseModel):
|
||||||
allow_population_by_field_name = True
|
allow_population_by_field_name = True
|
||||||
alias_generator = lambda value: value.upper()
|
alias_generator = lambda value: value.upper()
|
||||||
|
|
||||||
@validator("*", pre=True)
|
@validator("*", pre=True, allow_reuse=True)
|
||||||
def raise_for_none(cls, v):
|
def raise_for_none(cls, v):
|
||||||
if v is None:
|
if v is None:
|
||||||
raise ValueError("None / null is not allowed")
|
raise ValueError("None / null is not allowed")
|
||||||
|
@ -265,7 +265,7 @@ class ConfigSchema(BaseModel):
|
||||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||||
components: Dict[str, Dict[str, Any]]
|
components: Dict[str, Dict[str, Any]]
|
||||||
|
|
||||||
@root_validator
|
@root_validator(allow_reuse=True)
|
||||||
def validate_config(cls, values):
|
def validate_config(cls, values):
|
||||||
"""Perform additional validation for settings with dependencies."""
|
"""Perform additional validation for settings with dependencies."""
|
||||||
pt = values.get("pretraining")
|
pt = values.get("pretraining")
|
||||||
|
@ -283,7 +283,15 @@ class ConfigSchema(BaseModel):
|
||||||
# Project config Schema
|
# Project config Schema
|
||||||
|
|
||||||
|
|
||||||
class ProjectConfigAsset(BaseModel):
|
class ProjectConfigAssetGitItem(BaseModel):
|
||||||
|
# fmt: off
|
||||||
|
repo: StrictStr = Field(..., title="URL of Git repo to download from")
|
||||||
|
path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
|
||||||
|
branch: StrictStr = Field("master", title="Branch to clone from")
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectConfigAssetURL(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
||||||
url: Optional[StrictStr] = Field(None, title="URL of asset")
|
url: Optional[StrictStr] = Field(None, title="URL of asset")
|
||||||
|
@ -291,6 +299,13 @@ class ProjectConfigAsset(BaseModel):
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectConfigAssetGit(BaseModel):
|
||||||
|
# fmt: off
|
||||||
|
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
|
||||||
|
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
class ProjectConfigCommand(BaseModel):
|
class ProjectConfigCommand(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
name: StrictStr = Field(..., title="Name of command")
|
name: StrictStr = Field(..., title="Name of command")
|
||||||
|
@ -310,7 +325,7 @@ class ProjectConfigCommand(BaseModel):
|
||||||
class ProjectConfigSchema(BaseModel):
|
class ProjectConfigSchema(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
|
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
|
||||||
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
|
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
|
||||||
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
||||||
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
|
@ -30,11 +30,11 @@ class PRFScore:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def precision(self) -> float:
|
def precision(self) -> float:
|
||||||
return (self.tp / (self.tp + self.fp + 1e-100))
|
return self.tp / (self.tp + self.fp + 1e-100)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def recall(self) -> float:
|
def recall(self) -> float:
|
||||||
return (self.tp / (self.tp + self.fn + 1e-100))
|
return self.tp / (self.tp + self.fn + 1e-100)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def fscore(self) -> float:
|
def fscore(self) -> float:
|
||||||
|
|
|
@ -472,7 +472,6 @@ def sort_nums(x):
|
||||||
return x[1]
|
return x[1]
|
||||||
|
|
||||||
|
|
||||||
PRON_LEMMA = "-PRON-"
|
|
||||||
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||||
# Unfortunate hack here, to work around problem with long cpdef enum
|
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||||
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||||
|
|
|
@ -270,6 +270,41 @@ def test_pretrain_make_docs():
|
||||||
assert skip_count == 0
|
assert skip_count == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_project_config_validation_full():
|
||||||
|
config = {
|
||||||
|
"vars": {"some_var": 20},
|
||||||
|
"directories": ["assets", "configs", "corpus", "scripts", "training"],
|
||||||
|
"assets": [
|
||||||
|
{
|
||||||
|
"dest": "x",
|
||||||
|
"url": "https://example.com",
|
||||||
|
"checksum": "63373dd656daa1fd3043ce166a59474c",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dest": "y",
|
||||||
|
"git": {
|
||||||
|
"repo": "https://github.com/example/repo",
|
||||||
|
"branch": "develop",
|
||||||
|
"path": "y",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"commands": [
|
||||||
|
{
|
||||||
|
"name": "train",
|
||||||
|
"help": "Train a model",
|
||||||
|
"script": ["python -m spacy train config.cfg -o training"],
|
||||||
|
"deps": ["config.cfg", "corpus/training.spcy"],
|
||||||
|
"outputs": ["training/model-best"],
|
||||||
|
},
|
||||||
|
{"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
|
||||||
|
],
|
||||||
|
"workflows": {"all": ["train", "test"], "train": ["train"]},
|
||||||
|
}
|
||||||
|
errors = validate(ProjectConfigSchema, config)
|
||||||
|
assert not errors
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"config",
|
"config",
|
||||||
[
|
[
|
||||||
|
|
|
@ -572,7 +572,7 @@ def join_command(command: List[str]) -> str:
|
||||||
return " ".join(shlex.quote(cmd) for cmd in command)
|
return " ".join(shlex.quote(cmd) for cmd in command)
|
||||||
|
|
||||||
|
|
||||||
def run_command(command: Union[str, List[str]]) -> None:
|
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None:
|
||||||
"""Run a command on the command line as a subprocess. If the subprocess
|
"""Run a command on the command line as a subprocess. If the subprocess
|
||||||
returns a non-zero exit code, a system exit is performed.
|
returns a non-zero exit code, a system exit is performed.
|
||||||
|
|
||||||
|
@ -582,13 +582,22 @@ def run_command(command: Union[str, List[str]]) -> None:
|
||||||
if isinstance(command, str):
|
if isinstance(command, str):
|
||||||
command = split_command(command)
|
command = split_command(command)
|
||||||
try:
|
try:
|
||||||
status = subprocess.call(command, env=os.environ.copy())
|
ret = subprocess.run(
|
||||||
|
command,
|
||||||
|
env=os.environ.copy(),
|
||||||
|
input=stdin,
|
||||||
|
encoding="utf8",
|
||||||
|
check=True,
|
||||||
|
stdout=subprocess.PIPE if capture else None,
|
||||||
|
stderr=subprocess.PIPE if capture else None,
|
||||||
|
)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
raise FileNotFoundError(
|
raise FileNotFoundError(
|
||||||
Errors.E970.format(str_command=" ".join(command), tool=command[0])
|
Errors.E970.format(str_command=" ".join(command), tool=command[0])
|
||||||
) from None
|
) from None
|
||||||
if status != 0:
|
if ret.returncode != 0:
|
||||||
sys.exit(status)
|
sys.exit(ret.returncode)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
|
|
|
@ -660,8 +660,10 @@ for more info.
|
||||||
As of spaCy v3.0, the `pretrain` command takes the same
|
As of spaCy v3.0, the `pretrain` command takes the same
|
||||||
[config file](/usage/training#config) as the `train` command. This ensures that
|
[config file](/usage/training#config) as the `train` command. This ensures that
|
||||||
settings are consistent between pretraining and training. Settings for
|
settings are consistent between pretraining and training. Settings for
|
||||||
pretraining can be defined in the `[pretraining]` block of the config file. See
|
pretraining can be defined in the `[pretraining]` block of the config file and
|
||||||
the [data format](/api/data-formats#config) for details.
|
auto-generated by setting `--pretraining` on
|
||||||
|
[`init fill-config`](/api/cli#init-fill-config). Also see the
|
||||||
|
[data format](/api/data-formats#config) for details.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
|
|
@ -375,7 +375,8 @@ The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the
|
||||||
"token-to-vector" embedding layer of pipeline components from raw text. Raw text
|
"token-to-vector" embedding layer of pipeline components from raw text. Raw text
|
||||||
can be provided as a `.jsonl` (newline-delimited JSON) file containing one input
|
can be provided as a `.jsonl` (newline-delimited JSON) file containing one input
|
||||||
text per line (roughly paragraph length is good). Optionally, custom
|
text per line (roughly paragraph length is good). Optionally, custom
|
||||||
tokenization can be provided.
|
tokenization can be provided. The JSONL format means that the texts can be read
|
||||||
|
in line-by-line, while still making it easy to represent newlines in the data.
|
||||||
|
|
||||||
> #### Tip: Writing JSONL
|
> #### Tip: Writing JSONL
|
||||||
>
|
>
|
||||||
|
|
|
@ -43,6 +43,8 @@ recognizer doesn't use any features set by the tagger and parser, and so on.
|
||||||
This means that you can swap them, or remove single components from the pipeline
|
This means that you can swap them, or remove single components from the pipeline
|
||||||
without affecting the others. However, components may share a "token-to-vector"
|
without affecting the others. However, components may share a "token-to-vector"
|
||||||
component like [`Tok2Vec`](/api/tok2vec) or [`Transformer`](/api/transformer).
|
component like [`Tok2Vec`](/api/tok2vec) or [`Transformer`](/api/transformer).
|
||||||
|
You can read more about this in the docs on
|
||||||
|
[embedding layers](/usage/embeddings-transformers#embedding-layers).
|
||||||
|
|
||||||
Custom components may also depend on annotations set by other components. For
|
Custom components may also depend on annotations set by other components. For
|
||||||
example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
|
example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
|
||||||
|
|
|
@ -107,7 +107,62 @@ transformer outputs to the
|
||||||
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
|
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
|
||||||
giving you access to them after the pipeline has finished running.
|
giving you access to them after the pipeline has finished running.
|
||||||
|
|
||||||
<!-- TODO: show example of implementation via config, side by side -->
|
### Example: Shared vs. independent config {#embedding-layers-config}
|
||||||
|
|
||||||
|
The [config system](/usage/training#config) lets you express model configuration
|
||||||
|
for both shared and independent embedding layers. The shared setup uses a single
|
||||||
|
[`Tok2Vec`](/api/tok2vec) component with the
|
||||||
|
[Tok2Vec](/api/architectures#Tok2Vec) architecture. All other components, like
|
||||||
|
the entity recognizer, use a
|
||||||
|
[Tok2VecListener](/api/architectures#Tok2VecListener) layer as their model's
|
||||||
|
`tok2vec` argument, which connects to the `tok2vec` component model.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### Shared {highlight="1-2,4-5,19-20"}
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
```
|
||||||
|
|
||||||
|
In the independent setup, the entity recognizer component defines its own
|
||||||
|
[Tok2Vec](/api/architectures#Tok2Vec) instance. Other components will do the
|
||||||
|
same. This makes them fully independent and doesn't require an upstream
|
||||||
|
[`Tok2Vec`](/api/tok2vec) component to be present in the pipeline.
|
||||||
|
|
||||||
|
```ini
|
||||||
|
### Independent {highlight="7-8"}
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v1"
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2Vec.v1"
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
```
|
||||||
|
|
||||||
<!-- TODO: Once rehearsal is tested, mention it here. -->
|
<!-- TODO: Once rehearsal is tested, mention it here. -->
|
||||||
|
|
||||||
|
@ -503,3 +558,22 @@ def MyCustomVectors(
|
||||||
## Pretraining {#pretraining}
|
## Pretraining {#pretraining}
|
||||||
|
|
||||||
<!-- TODO: write -->
|
<!-- TODO: write -->
|
||||||
|
|
||||||
|
> #### Raw text format
|
||||||
|
>
|
||||||
|
> The raw text can be provided as JSONL (newline-delimited JSON) with a key
|
||||||
|
> `"text"` per entry. This allows the data to be read in line by line, while
|
||||||
|
> also allowing you to include newlines in the texts.
|
||||||
|
>
|
||||||
|
> ```json
|
||||||
|
> {"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
|
||||||
|
> {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining
|
||||||
|
```
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg
|
||||||
|
```
|
||||||
|
|
|
@ -82,6 +82,14 @@ check whether a [`Doc`](/api/doc) object has been parsed with the
|
||||||
`doc.is_parsed` attribute, which returns a boolean value. If this attribute is
|
`doc.is_parsed` attribute, which returns a boolean value. If this attribute is
|
||||||
`False`, the default sentence iterator will raise an exception.
|
`False`, the default sentence iterator will raise an exception.
|
||||||
|
|
||||||
|
<Infobox title="Dependency label scheme" emoji="📖">
|
||||||
|
|
||||||
|
For a list of the syntactic dependency labels assigned by spaCy's models across
|
||||||
|
different languages, see the label schemes documented in the
|
||||||
|
[models directory](/models).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
### Noun chunks {#noun-chunks}
|
### Noun chunks {#noun-chunks}
|
||||||
|
|
||||||
Noun chunks are "base noun phrases" – flat phrases that have a noun as their
|
Noun chunks are "base noun phrases" – flat phrases that have a noun as their
|
||||||
|
@ -288,11 +296,45 @@ for token in doc:
|
||||||
| their | `ADJ` | `poss` | requests |
|
| their | `ADJ` | `poss` | requests |
|
||||||
| requests | `NOUN` | `dobj` | submit |
|
| requests | `NOUN` | `dobj` | submit |
|
||||||
|
|
||||||
<Infobox title="Dependency label scheme" emoji="📖">
|
The dependency parse can be a useful tool for **information extraction**,
|
||||||
|
especially when combined with other predictions like
|
||||||
|
[named entities](#named-entities). The following example extracts money and
|
||||||
|
currency values, i.e. entities labeled as `MONEY`, and then uses the dependency
|
||||||
|
parse to find the noun phrase they are referring to – for example `"Net income"`
|
||||||
|
→ `"$9.4 million"`.
|
||||||
|
|
||||||
For a list of the syntactic dependency labels assigned by spaCy's models across
|
```python
|
||||||
different languages, see the label schemes documented in the
|
### {executable="true"}
|
||||||
[models directory](/models).
|
import spacy
|
||||||
|
|
||||||
|
nlp = spacy.load("en_core_web_sm")
|
||||||
|
# Merge noun phrases and entities for easier analysis
|
||||||
|
nlp.add_pipe("merge_entities")
|
||||||
|
nlp.add_pipe("merge_noun_chunks")
|
||||||
|
|
||||||
|
TEXTS = [
|
||||||
|
"Net income was $9.4 million compared to the prior year of $2.7 million.",
|
||||||
|
"Revenue exceeded twelve billion dollars, with a loss of $1b.",
|
||||||
|
]
|
||||||
|
for doc in nlp.pipe(TEXTS):
|
||||||
|
for token in doc:
|
||||||
|
if token.ent_type_ == "MONEY":
|
||||||
|
# We have an attribute and direct object, so check for subject
|
||||||
|
if token.dep_ in ("attr", "dobj"):
|
||||||
|
subj = [w for w in token.head.lefts if w.dep_ == "nsubj"]
|
||||||
|
if subj:
|
||||||
|
print(subj[0], "-->", token)
|
||||||
|
# We have a prepositional object with a preposition
|
||||||
|
elif token.dep_ == "pobj" and token.head.dep_ == "prep":
|
||||||
|
print(token.head.head, "-->", token)
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Combining models and rules" emoji="📖">
|
||||||
|
|
||||||
|
For more examples of how to write rule-based information extraction logic that
|
||||||
|
takes advantage of the model's predictions produced by the different components,
|
||||||
|
see the usage guide on
|
||||||
|
[combining models and rules](/usage/rule-based-matching#models-rules).
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
@ -545,7 +587,7 @@ identifier from a knowledge base (KB). You can create your own
|
||||||
[train a new Entity Linking model](/usage/training#entity-linker) using that
|
[train a new Entity Linking model](/usage/training#entity-linker) using that
|
||||||
custom-made KB.
|
custom-made KB.
|
||||||
|
|
||||||
### Accessing entity identifiers {#entity-linking-accessing}
|
### Accessing entity identifiers {#entity-linking-accessing model="entity linking"}
|
||||||
|
|
||||||
The annotated KB identifier is accessible as either a hash value or as a string,
|
The annotated KB identifier is accessible as either a hash value or as a string,
|
||||||
using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span)
|
using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span)
|
||||||
|
@ -571,15 +613,6 @@ print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259']
|
||||||
print(ent_london_5) # ['London', 'GPE', 'Q84']
|
print(ent_london_5) # ['London', 'GPE', 'Q84']
|
||||||
```
|
```
|
||||||
|
|
||||||
| Text | ent_type\_ | ent_kb_id\_ |
|
|
||||||
| -------- | ---------- | ----------- |
|
|
||||||
| Ada | `"PERSON"` | `"Q7259"` |
|
|
||||||
| Lovelace | `"PERSON"` | `"Q7259"` |
|
|
||||||
| was | - | - |
|
|
||||||
| born | - | - |
|
|
||||||
| in | - | - |
|
|
||||||
| London | `"GPE"` | `"Q84"` |
|
|
||||||
|
|
||||||
## Tokenization {#tokenization}
|
## Tokenization {#tokenization}
|
||||||
|
|
||||||
Tokenization is the task of splitting a text into meaningful segments, called
|
Tokenization is the task of splitting a text into meaningful segments, called
|
||||||
|
|
|
@ -88,6 +88,12 @@ can also use any private repo you have access to with Git.
|
||||||
> - dest: 'assets/training.spacy'
|
> - dest: 'assets/training.spacy'
|
||||||
> url: 'https://example.com/data.spacy'
|
> url: 'https://example.com/data.spacy'
|
||||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||||
|
> - dest: 'assets/development.spacy'
|
||||||
|
> git:
|
||||||
|
> repo: 'https://github.com/example/repo'
|
||||||
|
> branch: 'master'
|
||||||
|
> path: 'path/developments.spacy'
|
||||||
|
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Assets are data files your project needs – for example, the training and
|
Assets are data files your project needs – for example, the training and
|
||||||
|
@ -104,22 +110,8 @@ $ python -m spacy project assets
|
||||||
|
|
||||||
Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
|
Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
|
||||||
even cloud storage such as GCS and S3. You can also fetch assets using git, by
|
even cloud storage such as GCS and S3. You can also fetch assets using git, by
|
||||||
replacing the `url` string with a `git` block, like this:
|
replacing the `url` string with a `git` block. spaCy will use Git's "sparse
|
||||||
|
checkout" feature, to avoid download the whole repository.
|
||||||
> #### project.yml
|
|
||||||
>
|
|
||||||
> ```yaml
|
|
||||||
> assets:
|
|
||||||
> - dest: 'assets/training.spacy'
|
|
||||||
> git:
|
|
||||||
> repo: "https://github.com/example/repo"
|
|
||||||
> branch: "master"
|
|
||||||
> path: "some/path"
|
|
||||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
|
||||||
> ```
|
|
||||||
|
|
||||||
spaCy will use Git's "sparse checkout" feature, to avoid download the whole
|
|
||||||
repository.
|
|
||||||
|
|
||||||
### 3. Run a command {#run}
|
### 3. Run a command {#run}
|
||||||
|
|
||||||
|
@ -236,10 +228,93 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.
|
||||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
|
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||||
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||||
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
|
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
|
||||||
|
|
||||||
|
### Data assets {#data-assets}
|
||||||
|
|
||||||
|
Assets are any files that your project might need, like training and development
|
||||||
|
corpora or pretrained weights for initializing your model. Assets are defined in
|
||||||
|
the `assets` block of your `project.yml` and can be downloaded using the
|
||||||
|
[`project assets`](/api/cli#project-assets) command. Defining checksums lets you
|
||||||
|
verify that someone else running your project will use the same files you used.
|
||||||
|
Asset URLs can use a number of different **protocols**: HTTP, HTTPS, FTP, SSH,
|
||||||
|
and even **cloud storage** such as GCS and S3. You can also download assets from
|
||||||
|
a **Git repo** instead.
|
||||||
|
|
||||||
|
#### Downloading from a URL or cloud storage {#data-assets-url}
|
||||||
|
|
||||||
|
Under the hood, spaCy uses the
|
||||||
|
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library so you
|
||||||
|
can use any protocol it supports. Note that you may need to install extra
|
||||||
|
dependencies to use certain protocols.
|
||||||
|
|
||||||
|
> #### project.yml
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> assets:
|
||||||
|
> # Download from public HTTPS URL
|
||||||
|
> - dest: 'assets/training.spacy'
|
||||||
|
> url: 'https://example.com/data.spacy'
|
||||||
|
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||||
|
> # Download from Google Cloud Storage bucket
|
||||||
|
> - dest: 'assets/development.spacy'
|
||||||
|
> url: 'gs://your-bucket/corpora'
|
||||||
|
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||||
|
| `url` | The URL to download from, using the respective protocol. |
|
||||||
|
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||||
|
|
||||||
|
#### Downloading from a Git repo {#data-assets-git}
|
||||||
|
|
||||||
|
If a `git` block is provided, the asset is downloaded from the given Git
|
||||||
|
repository. You can download from any repo that you have access to. Under the
|
||||||
|
hood, this uses Git's "sparse checkout" feature, so you're only downloading the
|
||||||
|
files you need and not the whole repo.
|
||||||
|
|
||||||
|
> #### project.yml
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> assets:
|
||||||
|
> - dest: 'assets/training.spacy'
|
||||||
|
> git:
|
||||||
|
> repo: 'https://github.com/example/repo'
|
||||||
|
> branch: 'master'
|
||||||
|
> path: 'path/training.spacy'
|
||||||
|
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||||
|
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root.<br />`branch`: The branch to download from. Defaults to `"master"`. |
|
||||||
|
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||||
|
|
||||||
|
#### Working with private assets {#data-assets-private}
|
||||||
|
|
||||||
|
> #### project.yml
|
||||||
|
>
|
||||||
|
> ```yaml
|
||||||
|
> assets:
|
||||||
|
> - dest: 'assets/private_training_data.json'
|
||||||
|
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||||
|
> - dest: 'assets/private_vectors.bin'
|
||||||
|
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||||
|
> ```
|
||||||
|
|
||||||
|
For many projects, the datasets and weights you're working with might be
|
||||||
|
company-internal and not available over the internet. In that case, you can
|
||||||
|
specify the destination paths and a checksum, and leave out the URL. When your
|
||||||
|
teammates clone and run your project, they can place the files in the respective
|
||||||
|
directory themselves. The [`project assets`](/api/cli#project-assets) command
|
||||||
|
will alert about missing files and mismatched checksums, so you can ensure that
|
||||||
|
others are running your project with the same data.
|
||||||
|
|
||||||
### Dependencies and outputs {#deps-outputs}
|
### Dependencies and outputs {#deps-outputs}
|
||||||
|
|
||||||
Each command defined in the `project.yml` can optionally define a list of
|
Each command defined in the `project.yml` can optionally define a list of
|
||||||
|
@ -446,25 +521,6 @@ projects.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
### Working with private assets {#private-assets}
|
|
||||||
|
|
||||||
For many projects, the datasets and weights you're working with might be
|
|
||||||
company-internal and not available via a public URL. In that case, you can
|
|
||||||
specify the destination paths and a checksum, and leave out the URL. When your
|
|
||||||
teammates clone and run your project, they can place the files in the respective
|
|
||||||
directory themselves. The [`spacy project assets`](/api/cli#project-assets)
|
|
||||||
command will alert about missing files and mismatched checksums, so you can
|
|
||||||
ensure that others are running your project with the same data.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
### project.yml
|
|
||||||
assets:
|
|
||||||
- dest: 'assets/private_training_data.json'
|
|
||||||
checksum: '63373dd656daa1fd3043ce166a59474c'
|
|
||||||
- dest: 'assets/private_vectors.bin'
|
|
||||||
checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
|
||||||
```
|
|
||||||
|
|
||||||
## Remote Storage {#remote}
|
## Remote Storage {#remote}
|
||||||
|
|
||||||
You can persist your project outputs to a remote storage using the
|
You can persist your project outputs to a remote storage using the
|
||||||
|
|
|
@ -365,6 +365,8 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
|
||||||
[`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list
|
[`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list
|
||||||
of patterns as the second argument (instead of a variable number of
|
of patterns as the second argument (instead of a variable number of
|
||||||
arguments). The `on_match` callback becomes an optional keyword argument.
|
arguments). The `on_match` callback becomes an optional keyword argument.
|
||||||
|
- The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has
|
||||||
|
been removed.
|
||||||
|
|
||||||
### Removed or renamed API {#incompat-removed}
|
### Removed or renamed API {#incompat-removed}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user