mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'develop' into refactor/vector-names
This commit is contained in:
commit
191fb4144f
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a11"
|
||||
__version__ = "3.0.0a12"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -195,7 +195,7 @@ def get_checksum(path: Union[Path, str]) -> str:
|
|||
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
||||
dir_checksum.update(sub_file.read_bytes())
|
||||
return dir_checksum.hexdigest()
|
||||
raise ValueError(f"Can't get checksum for {path}: not a file or directory")
|
||||
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
@ -320,23 +320,43 @@ def git_sparse_checkout(
|
|||
repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
|
||||
):
|
||||
if dest.exists():
|
||||
raise IOError("Destination of checkout must not exist")
|
||||
msg.fail("Destination of checkout must not exist", exits=1)
|
||||
if not dest.parent.exists():
|
||||
raise IOError("Parent of destination of checkout must exist")
|
||||
# We're using Git, partial clone and sparse checkout to
|
||||
# only clone the files we need
|
||||
# This ends up being RIDICULOUS. omg.
|
||||
# So, every tutorial and SO post talks about 'sparse checkout'...But they
|
||||
# go and *clone* the whole repo. Worthless. And cloning part of a repo
|
||||
# turns out to be completely broken. The only way to specify a "path" is..
|
||||
# a path *on the server*? The contents of which, specifies the paths. Wat.
|
||||
# Obviously this is hopelessly broken and insecure, because you can query
|
||||
# arbitrary paths on the server! So nobody enables this.
|
||||
# What we have to do is disable *all* files. We could then just checkout
|
||||
# the path, and it'd "work", but be hopelessly slow...Because it goes and
|
||||
# transfers every missing object one-by-one. So the final piece is that we
|
||||
# need to use some weird git internals to fetch the missings in bulk, and
|
||||
# *that* we can do by path.
|
||||
# We're using Git and sparse checkout to only clone the files we need
|
||||
with make_tempdir() as tmp_dir:
|
||||
# This is the "clone, but don't download anything" part.
|
||||
cmd = (
|
||||
f"git clone {repo} {tmp_dir} --no-checkout "
|
||||
"--depth 1 --config core.sparseCheckout=true"
|
||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||
"--filter=blob:none" # <-- The key bit
|
||||
)
|
||||
if branch is not None:
|
||||
cmd = f"{cmd} -b {branch}"
|
||||
run_command(cmd)
|
||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||
f.write(subpath)
|
||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
||||
run_command(cmd, capture=True)
|
||||
# Now we need to find the missing filenames for the subpath we want.
|
||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||
ret = run_command(cmd, capture=True)
|
||||
missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
# Now pass those missings into another bit of git internals
|
||||
run_command(
|
||||
f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
|
||||
)
|
||||
# And finally, we can checkout our subpath
|
||||
run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||
print(dest)
|
||||
print(list(dest.iterdir()))
|
||||
|
|
|
@ -77,7 +77,9 @@ def package(
|
|||
meta = generate_meta(meta, msg)
|
||||
errors = validate(ModelMetaSchema, meta)
|
||||
if errors:
|
||||
msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
|
||||
msg.fail("Invalid model meta.json")
|
||||
print("\n".join(errors))
|
||||
sys.exit(1)
|
||||
model_name = meta["lang"] + "_" + meta["name"]
|
||||
model_name_v = model_name + "-" + meta["version"]
|
||||
main_path = output_dir / model_name_v
|
||||
|
@ -118,10 +120,10 @@ def get_meta(
|
|||
"lang": "en",
|
||||
"name": "model",
|
||||
"version": "0.0.0",
|
||||
"description": None,
|
||||
"author": None,
|
||||
"email": None,
|
||||
"url": None,
|
||||
"description": "",
|
||||
"author": "",
|
||||
"email": "",
|
||||
"url": "",
|
||||
"license": "MIT",
|
||||
}
|
||||
meta.update(existing_meta)
|
||||
|
|
|
@ -14,7 +14,7 @@ def project_pull_cli(
|
|||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Retrieve any precomputed outputs from a remote storage that are available.
|
||||
"""Retrieve available precomputed outputs from a remote storage.
|
||||
You can alias remotes in your project.yml by mapping them to storage paths.
|
||||
A storage can be anything that the smart-open library can upload to, e.g.
|
||||
gcs, aws, ssh, local directories etc
|
||||
|
|
|
@ -63,7 +63,7 @@ class TokenPatternString(BaseModel):
|
|||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
@validator("*", pre=True, each_item=True)
|
||||
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
||||
def raise_for_none(cls, v):
|
||||
if v is None:
|
||||
raise ValueError("None / null is not allowed")
|
||||
|
@ -84,7 +84,7 @@ class TokenPatternNumber(BaseModel):
|
|||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
@validator("*", pre=True, each_item=True)
|
||||
@validator("*", pre=True, each_item=True, allow_reuse=True)
|
||||
def raise_for_none(cls, v):
|
||||
if v is None:
|
||||
raise ValueError("None / null is not allowed")
|
||||
|
@ -145,7 +145,7 @@ class TokenPattern(BaseModel):
|
|||
allow_population_by_field_name = True
|
||||
alias_generator = lambda value: value.upper()
|
||||
|
||||
@validator("*", pre=True)
|
||||
@validator("*", pre=True, allow_reuse=True)
|
||||
def raise_for_none(cls, v):
|
||||
if v is None:
|
||||
raise ValueError("None / null is not allowed")
|
||||
|
@ -265,7 +265,7 @@ class ConfigSchema(BaseModel):
|
|||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||
components: Dict[str, Dict[str, Any]]
|
||||
|
||||
@root_validator
|
||||
@root_validator(allow_reuse=True)
|
||||
def validate_config(cls, values):
|
||||
"""Perform additional validation for settings with dependencies."""
|
||||
pt = values.get("pretraining")
|
||||
|
@ -283,7 +283,15 @@ class ConfigSchema(BaseModel):
|
|||
# Project config Schema
|
||||
|
||||
|
||||
class ProjectConfigAsset(BaseModel):
|
||||
class ProjectConfigAssetGitItem(BaseModel):
|
||||
# fmt: off
|
||||
repo: StrictStr = Field(..., title="URL of Git repo to download from")
|
||||
path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
|
||||
branch: StrictStr = Field("master", title="Branch to clone from")
|
||||
# fmt: on
|
||||
|
||||
|
||||
class ProjectConfigAssetURL(BaseModel):
|
||||
# fmt: off
|
||||
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
||||
url: Optional[StrictStr] = Field(None, title="URL of asset")
|
||||
|
@ -291,6 +299,13 @@ class ProjectConfigAsset(BaseModel):
|
|||
# fmt: on
|
||||
|
||||
|
||||
class ProjectConfigAssetGit(BaseModel):
|
||||
# fmt: off
|
||||
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
|
||||
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||
# fmt: on
|
||||
|
||||
|
||||
class ProjectConfigCommand(BaseModel):
|
||||
# fmt: off
|
||||
name: StrictStr = Field(..., title="Name of command")
|
||||
|
@ -310,7 +325,7 @@ class ProjectConfigCommand(BaseModel):
|
|||
class ProjectConfigSchema(BaseModel):
|
||||
# fmt: off
|
||||
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
|
||||
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
|
||||
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
|
||||
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
||||
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
||||
# fmt: on
|
||||
|
|
|
@ -30,11 +30,11 @@ class PRFScore:
|
|||
|
||||
@property
|
||||
def precision(self) -> float:
|
||||
return (self.tp / (self.tp + self.fp + 1e-100))
|
||||
return self.tp / (self.tp + self.fp + 1e-100)
|
||||
|
||||
@property
|
||||
def recall(self) -> float:
|
||||
return (self.tp / (self.tp + self.fn + 1e-100))
|
||||
return self.tp / (self.tp + self.fn + 1e-100)
|
||||
|
||||
@property
|
||||
def fscore(self) -> float:
|
||||
|
|
|
@ -472,7 +472,6 @@ def sort_nums(x):
|
|||
return x[1]
|
||||
|
||||
|
||||
PRON_LEMMA = "-PRON-"
|
||||
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||
|
|
|
@ -270,6 +270,41 @@ def test_pretrain_make_docs():
|
|||
assert skip_count == 0
|
||||
|
||||
|
||||
def test_project_config_validation_full():
|
||||
config = {
|
||||
"vars": {"some_var": 20},
|
||||
"directories": ["assets", "configs", "corpus", "scripts", "training"],
|
||||
"assets": [
|
||||
{
|
||||
"dest": "x",
|
||||
"url": "https://example.com",
|
||||
"checksum": "63373dd656daa1fd3043ce166a59474c",
|
||||
},
|
||||
{
|
||||
"dest": "y",
|
||||
"git": {
|
||||
"repo": "https://github.com/example/repo",
|
||||
"branch": "develop",
|
||||
"path": "y",
|
||||
},
|
||||
},
|
||||
],
|
||||
"commands": [
|
||||
{
|
||||
"name": "train",
|
||||
"help": "Train a model",
|
||||
"script": ["python -m spacy train config.cfg -o training"],
|
||||
"deps": ["config.cfg", "corpus/training.spcy"],
|
||||
"outputs": ["training/model-best"],
|
||||
},
|
||||
{"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
|
||||
],
|
||||
"workflows": {"all": ["train", "test"], "train": ["train"]},
|
||||
}
|
||||
errors = validate(ProjectConfigSchema, config)
|
||||
assert not errors
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"config",
|
||||
[
|
||||
|
|
|
@ -572,7 +572,7 @@ def join_command(command: List[str]) -> str:
|
|||
return " ".join(shlex.quote(cmd) for cmd in command)
|
||||
|
||||
|
||||
def run_command(command: Union[str, List[str]]) -> None:
|
||||
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None:
|
||||
"""Run a command on the command line as a subprocess. If the subprocess
|
||||
returns a non-zero exit code, a system exit is performed.
|
||||
|
||||
|
@ -582,13 +582,22 @@ def run_command(command: Union[str, List[str]]) -> None:
|
|||
if isinstance(command, str):
|
||||
command = split_command(command)
|
||||
try:
|
||||
status = subprocess.call(command, env=os.environ.copy())
|
||||
ret = subprocess.run(
|
||||
command,
|
||||
env=os.environ.copy(),
|
||||
input=stdin,
|
||||
encoding="utf8",
|
||||
check=True,
|
||||
stdout=subprocess.PIPE if capture else None,
|
||||
stderr=subprocess.PIPE if capture else None,
|
||||
)
|
||||
except FileNotFoundError:
|
||||
raise FileNotFoundError(
|
||||
Errors.E970.format(str_command=" ".join(command), tool=command[0])
|
||||
) from None
|
||||
if status != 0:
|
||||
sys.exit(status)
|
||||
if ret.returncode != 0:
|
||||
sys.exit(ret.returncode)
|
||||
return ret
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
|
|
@ -660,8 +660,10 @@ for more info.
|
|||
As of spaCy v3.0, the `pretrain` command takes the same
|
||||
[config file](/usage/training#config) as the `train` command. This ensures that
|
||||
settings are consistent between pretraining and training. Settings for
|
||||
pretraining can be defined in the `[pretraining]` block of the config file. See
|
||||
the [data format](/api/data-formats#config) for details.
|
||||
pretraining can be defined in the `[pretraining]` block of the config file and
|
||||
auto-generated by setting `--pretraining` on
|
||||
[`init fill-config`](/api/cli#init-fill-config). Also see the
|
||||
[data format](/api/data-formats#config) for details.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
|
|
@ -375,7 +375,8 @@ The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the
|
|||
"token-to-vector" embedding layer of pipeline components from raw text. Raw text
|
||||
can be provided as a `.jsonl` (newline-delimited JSON) file containing one input
|
||||
text per line (roughly paragraph length is good). Optionally, custom
|
||||
tokenization can be provided.
|
||||
tokenization can be provided. The JSONL format means that the texts can be read
|
||||
in line-by-line, while still making it easy to represent newlines in the data.
|
||||
|
||||
> #### Tip: Writing JSONL
|
||||
>
|
||||
|
|
|
@ -43,6 +43,8 @@ recognizer doesn't use any features set by the tagger and parser, and so on.
|
|||
This means that you can swap them, or remove single components from the pipeline
|
||||
without affecting the others. However, components may share a "token-to-vector"
|
||||
component like [`Tok2Vec`](/api/tok2vec) or [`Transformer`](/api/transformer).
|
||||
You can read more about this in the docs on
|
||||
[embedding layers](/usage/embeddings-transformers#embedding-layers).
|
||||
|
||||
Custom components may also depend on annotations set by other components. For
|
||||
example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
|
||||
|
|
|
@ -107,7 +107,62 @@ transformer outputs to the
|
|||
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
|
||||
giving you access to them after the pipeline has finished running.
|
||||
|
||||
<!-- TODO: show example of implementation via config, side by side -->
|
||||
### Example: Shared vs. independent config {#embedding-layers-config}
|
||||
|
||||
The [config system](/usage/training#config) lets you express model configuration
|
||||
for both shared and independent embedding layers. The shared setup uses a single
|
||||
[`Tok2Vec`](/api/tok2vec) component with the
|
||||
[Tok2Vec](/api/architectures#Tok2Vec) architecture. All other components, like
|
||||
the entity recognizer, use a
|
||||
[Tok2VecListener](/api/architectures#Tok2VecListener) layer as their model's
|
||||
`tok2vec` argument, which connects to the `tok2vec` component model.
|
||||
|
||||
```ini
|
||||
### Shared {highlight="1-2,4-5,19-20"}
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
|
||||
[components.ner]
|
||||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
```
|
||||
|
||||
In the independent setup, the entity recognizer component defines its own
|
||||
[Tok2Vec](/api/architectures#Tok2Vec) instance. Other components will do the
|
||||
same. This makes them fully independent and doesn't require an upstream
|
||||
[`Tok2Vec`](/api/tok2vec) component to be present in the pipeline.
|
||||
|
||||
```ini
|
||||
### Independent {highlight="7-8"}
|
||||
[components.ner]
|
||||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[components.ner.model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
|
||||
[components.ner.model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
```
|
||||
|
||||
<!-- TODO: Once rehearsal is tested, mention it here. -->
|
||||
|
||||
|
@ -503,3 +558,22 @@ def MyCustomVectors(
|
|||
## Pretraining {#pretraining}
|
||||
|
||||
<!-- TODO: write -->
|
||||
|
||||
> #### Raw text format
|
||||
>
|
||||
> The raw text can be provided as JSONL (newline-delimited JSON) with a key
|
||||
> `"text"` per entry. This allows the data to be read in line by line, while
|
||||
> also allowing you to include newlines in the texts.
|
||||
>
|
||||
> ```json
|
||||
> {"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
|
||||
> {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
|
||||
> ```
|
||||
|
||||
```cli
|
||||
$ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining
|
||||
```
|
||||
|
||||
```cli
|
||||
$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg
|
||||
```
|
||||
|
|
|
@ -82,6 +82,14 @@ check whether a [`Doc`](/api/doc) object has been parsed with the
|
|||
`doc.is_parsed` attribute, which returns a boolean value. If this attribute is
|
||||
`False`, the default sentence iterator will raise an exception.
|
||||
|
||||
<Infobox title="Dependency label scheme" emoji="📖">
|
||||
|
||||
For a list of the syntactic dependency labels assigned by spaCy's models across
|
||||
different languages, see the label schemes documented in the
|
||||
[models directory](/models).
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Noun chunks {#noun-chunks}
|
||||
|
||||
Noun chunks are "base noun phrases" – flat phrases that have a noun as their
|
||||
|
@ -288,11 +296,45 @@ for token in doc:
|
|||
| their | `ADJ` | `poss` | requests |
|
||||
| requests | `NOUN` | `dobj` | submit |
|
||||
|
||||
<Infobox title="Dependency label scheme" emoji="📖">
|
||||
The dependency parse can be a useful tool for **information extraction**,
|
||||
especially when combined with other predictions like
|
||||
[named entities](#named-entities). The following example extracts money and
|
||||
currency values, i.e. entities labeled as `MONEY`, and then uses the dependency
|
||||
parse to find the noun phrase they are referring to – for example `"Net income"`
|
||||
→ `"$9.4 million"`.
|
||||
|
||||
For a list of the syntactic dependency labels assigned by spaCy's models across
|
||||
different languages, see the label schemes documented in the
|
||||
[models directory](/models).
|
||||
```python
|
||||
### {executable="true"}
|
||||
import spacy
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
# Merge noun phrases and entities for easier analysis
|
||||
nlp.add_pipe("merge_entities")
|
||||
nlp.add_pipe("merge_noun_chunks")
|
||||
|
||||
TEXTS = [
|
||||
"Net income was $9.4 million compared to the prior year of $2.7 million.",
|
||||
"Revenue exceeded twelve billion dollars, with a loss of $1b.",
|
||||
]
|
||||
for doc in nlp.pipe(TEXTS):
|
||||
for token in doc:
|
||||
if token.ent_type_ == "MONEY":
|
||||
# We have an attribute or direct object, so check for subject
|
||||
if token.dep_ in ("attr", "dobj"):
|
||||
subj = [w for w in token.head.lefts if w.dep_ == "nsubj"]
|
||||
if subj:
|
||||
print(subj[0], "-->", token)
|
||||
# We have a prepositional object with a preposition
|
||||
elif token.dep_ == "pobj" and token.head.dep_ == "prep":
|
||||
print(token.head.head, "-->", token)
|
||||
```
|
||||
|
||||
<Infobox title="Combining models and rules" emoji="📖">
|
||||
|
||||
For more examples of how to write rule-based information extraction logic that
|
||||
takes advantage of the model's predictions produced by the different components,
|
||||
see the usage guide on
|
||||
[combining models and rules](/usage/rule-based-matching#models-rules).
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -545,7 +587,7 @@ identifier from a knowledge base (KB). You can create your own
|
|||
[train a new Entity Linking model](/usage/training#entity-linker) using that
|
||||
custom-made KB.
|
||||
|
||||
### Accessing entity identifiers {#entity-linking-accessing}
|
||||
### Accessing entity identifiers {#entity-linking-accessing model="entity linking"}
|
||||
|
||||
The annotated KB identifier is accessible as either a hash value or as a string,
|
||||
using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span)
|
||||
|
@ -571,15 +613,6 @@ print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259']
|
|||
print(ent_london_5) # ['London', 'GPE', 'Q84']
|
||||
```
|
||||
|
||||
| Text | ent_type\_ | ent_kb_id\_ |
|
||||
| -------- | ---------- | ----------- |
|
||||
| Ada | `"PERSON"` | `"Q7259"` |
|
||||
| Lovelace | `"PERSON"` | `"Q7259"` |
|
||||
| was | - | - |
|
||||
| born | - | - |
|
||||
| in | - | - |
|
||||
| London | `"GPE"` | `"Q84"` |
|
||||
|
||||
## Tokenization {#tokenization}
|
||||
|
||||
Tokenization is the task of splitting a text into meaningful segments, called
|
||||
|
|
|
@ -88,6 +88,12 @@ can also use any private repo you have access to with Git.
|
|||
> - dest: 'assets/training.spacy'
|
||||
> url: 'https://example.com/data.spacy'
|
||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||
> - dest: 'assets/development.spacy'
|
||||
> git:
|
||||
> repo: 'https://github.com/example/repo'
|
||||
> branch: 'master'
|
||||
> path: 'path/developments.spacy'
|
||||
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||
> ```
|
||||
|
||||
Assets are data files your project needs – for example, the training and
|
||||
|
@ -104,22 +110,8 @@ $ python -m spacy project assets
|
|||
|
||||
Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
|
||||
even cloud storage such as GCS and S3. You can also fetch assets using git, by
|
||||
replacing the `url` string with a `git` block, like this:
|
||||
|
||||
> #### project.yml
|
||||
>
|
||||
> ```yaml
|
||||
> assets:
|
||||
> - dest: 'assets/training.spacy'
|
||||
> git:
|
||||
> repo: "https://github.com/example/repo"
|
||||
> branch: "master"
|
||||
> path: "some/path"
|
||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||
> ```
|
||||
|
||||
spaCy will use Git's "sparse checkout" feature to avoid downloading the whole
|
||||
repository.
|
||||
replacing the `url` string with a `git` block. spaCy will use Git's "sparse
|
||||
checkout" feature to avoid downloading the whole repository.
|
||||
|
||||
### 3. Run a command {#run}
|
||||
|
||||
|
@ -236,10 +228,93 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.
|
|||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
|
||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
|
||||
|
||||
### Data assets {#data-assets}
|
||||
|
||||
Assets are any files that your project might need, like training and development
|
||||
corpora or pretrained weights for initializing your model. Assets are defined in
|
||||
the `assets` block of your `project.yml` and can be downloaded using the
|
||||
[`project assets`](/api/cli#project-assets) command. Defining checksums lets you
|
||||
verify that someone else running your project will use the same files you used.
|
||||
Asset URLs can use a number of different **protocols**: HTTP, HTTPS, FTP, SSH,
|
||||
and even **cloud storage** such as GCS and S3. You can also download assets from
|
||||
a **Git repo** instead.
|
||||
|
||||
#### Downloading from a URL or cloud storage {#data-assets-url}
|
||||
|
||||
Under the hood, spaCy uses the
|
||||
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library so you
|
||||
can use any protocol it supports. Note that you may need to install extra
|
||||
dependencies to use certain protocols.
|
||||
|
||||
> #### project.yml
|
||||
>
|
||||
> ```yaml
|
||||
> assets:
|
||||
> # Download from public HTTPS URL
|
||||
> - dest: 'assets/training.spacy'
|
||||
> url: 'https://example.com/data.spacy'
|
||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||
> # Download from Google Cloud Storage bucket
|
||||
> - dest: 'assets/development.spacy'
|
||||
> url: 'gs://your-bucket/corpora'
|
||||
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||
| `url` | The URL to download from, using the respective protocol. |
|
||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||
|
||||
#### Downloading from a Git repo {#data-assets-git}
|
||||
|
||||
If a `git` block is provided, the asset is downloaded from the given Git
|
||||
repository. You can download from any repo that you have access to. Under the
|
||||
hood, this uses Git's "sparse checkout" feature, so you're only downloading the
|
||||
files you need and not the whole repo.
|
||||
|
||||
> #### project.yml
|
||||
>
|
||||
> ```yaml
|
||||
> assets:
|
||||
> - dest: 'assets/training.spacy'
|
||||
> git:
|
||||
> repo: 'https://github.com/example/repo'
|
||||
> branch: 'master'
|
||||
> path: 'path/training.spacy'
|
||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root.<br />`branch`: The branch to download from. Defaults to `"master"`. |
|
||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||
|
||||
#### Working with private assets {#data-assets-private}
|
||||
|
||||
> #### project.yml
|
||||
>
|
||||
> ```yaml
|
||||
> assets:
|
||||
> - dest: 'assets/private_training_data.json'
|
||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||
> - dest: 'assets/private_vectors.bin'
|
||||
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||
> ```
|
||||
|
||||
For many projects, the datasets and weights you're working with might be
|
||||
company-internal and not available over the internet. In that case, you can
|
||||
specify the destination paths and a checksum, and leave out the URL. When your
|
||||
teammates clone and run your project, they can place the files in the respective
|
||||
directory themselves. The [`project assets`](/api/cli#project-assets) command
|
||||
will alert you about missing files and mismatched checksums, so you can ensure that
|
||||
others are running your project with the same data.
|
||||
|
||||
### Dependencies and outputs {#deps-outputs}
|
||||
|
||||
Each command defined in the `project.yml` can optionally define a list of
|
||||
|
@ -446,25 +521,6 @@ projects.
|
|||
|
||||
</Infobox>
|
||||
|
||||
### Working with private assets {#private-assets}
|
||||
|
||||
For many projects, the datasets and weights you're working with might be
|
||||
company-internal and not available via a public URL. In that case, you can
|
||||
specify the destination paths and a checksum, and leave out the URL. When your
|
||||
teammates clone and run your project, they can place the files in the respective
|
||||
directory themselves. The [`spacy project assets`](/api/cli#project-assets)
|
||||
command will alert about missing files and mismatched checksums, so you can
|
||||
ensure that others are running your project with the same data.
|
||||
|
||||
```yaml
|
||||
### project.yml
|
||||
assets:
|
||||
- dest: 'assets/private_training_data.json'
|
||||
checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||
- dest: 'assets/private_vectors.bin'
|
||||
checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||
```
|
||||
|
||||
## Remote Storage {#remote}
|
||||
|
||||
You can persist your project outputs to a remote storage using the
|
||||
|
|
|
@ -365,6 +365,8 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
|
|||
[`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list
|
||||
of patterns as the second argument (instead of a variable number of
|
||||
arguments). The `on_match` callback becomes an optional keyword argument.
|
||||
- The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas have
|
||||
been removed.
|
||||
|
||||
### Removed or renamed API {#incompat-removed}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user