From 8ac5ef1284cffe6c667bb406901cbb4d0b7e1dd6 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 25 Aug 2020 11:54:37 +0200
Subject: [PATCH 01/11] Update docs

---
 website/docs/api/cli.md                       |   6 +-
 website/docs/api/data-formats.md              |   3 +-
 website/docs/usage/101/_pipelines.md          |   2 +
 website/docs/usage/embeddings-transformers.md |  76 ++++++++++-
 website/docs/usage/projects.md                | 128 +++++++++++++-----
 website/docs/usage/v3.md                      |   2 +
 6 files changed, 177 insertions(+), 40 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 7ce95c019..967a96dda 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -660,8 +660,10 @@ for more info.
 As of spaCy v3.0, the `pretrain` command takes the same
 [config file](/usage/training#config) as the `train` command. This ensures that
 settings are consistent between pretraining and training. Settings for
-pretraining can be defined in the `[pretraining]` block of the config file. See
-the [data format](/api/data-formats#config) for details.
+pretraining can be defined in the `[pretraining]` block of the config file and
+auto-generated by setting `--pretraining` on
+[`init fill-config`](/api/cli#init-fill-config). Also see the
+[data format](/api/data-formats#config) for details.

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 727c0f35c..9375600a1 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -375,7 +375,8 @@ The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the
 "token-to-vector" embedding layer of pipeline components from raw text. Raw
 text can be provided as a `.jsonl` (newline-delimited JSON) file containing one
 input text per line (roughly paragraph length is good). Optionally, custom
-tokenization can be provided.
+tokenization can be provided. The JSONL format means that the texts can be read
+in line by line, while still making it easy to represent newlines in the data.

 > #### Tip: Writing JSONL
 >
diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md
index 295aa6e52..f85978d99 100644
--- a/website/docs/usage/101/_pipelines.md
+++ b/website/docs/usage/101/_pipelines.md
@@ -43,6 +43,8 @@ recognizer doesn't use any features set by the tagger and parser, and so on.
 This means that you can swap them, or remove single components from the
 pipeline without affecting the others. However, components may share a
 "token-to-vector" component like [`Tok2Vec`](/api/tok2vec) or [`Transformer`](/api/transformer).
+You can read more about this in the docs on
+[embedding layers](/usage/embeddings-transformers#embedding-layers).

 Custom components may also depend on annotations set by other components. For
 example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 7648a5d45..e2c1a6fd0 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -107,7 +107,62 @@ transformer outputs to the
 [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
 giving you access to them after the pipeline has finished running.

-

+### Example: Shared vs. independent config {#embedding-layers-config}
+
+The [config system](/usage/training#config) lets you express model configuration
+for both shared and independent embedding layers.
The shared setup uses a single +[`Tok2Vec`](/api/tok2vec) component with the +[Tok2Vec](/api/architectures#Tok2Vec) architecture. All other components, like +the entity recognizer, use a +[Tok2VecListener](/api/architectures#Tok2VecListener) layer as their model's +`tok2vec` argument, which connects to the `tok2vec` component model. + +```ini +### Shared {highlight="1-2,4-5,19-20"} +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" + +[components.ner] +factory = "ner" + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +``` + +In the independent setup, the entity recognizer component defines its own +[Tok2Vec](/api/architectures#Tok2Vec) instance. Other components will do the +same. This makes them fully independent and doesn't require an upstream +[`Tok2Vec`](/api/tok2vec) component to be present in the pipeline. + +```ini +### Independent {highlight="7-8"} +[components.ner] +factory = "ner" + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2Vec.v1" + +[components.ner.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" + +[components.ner.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +``` @@ -503,3 +558,22 @@ def MyCustomVectors( ## Pretraining {#pretraining} + +> #### Raw text format +> +> The raw text can be provided as JSONL (newline-delimited JSON) with a key +> `"text"` per entry. This allows the data to be read in line by line, while +> also allowing you to include newlines in the texts. +> +> ```json +> {"text": "Can I ask where you work now and what you do, and if you enjoy it?"} +> {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} +> ``` + +```cli +$ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining +``` + +```cli +$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg +``` diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 02551838c..6a32ab5d4 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -88,6 +88,12 @@ can also use any private repo you have access to with Git. > - dest: 'assets/training.spacy' > url: 'https://example.com/data.spacy' > checksum: '63373dd656daa1fd3043ce166a59474c' +> - dest: 'assets/development.spacy' +> git: +> repo: 'https://github.com/example/repo' +> branch: 'master' +> path: 'path/developments.spacy' +> checksum: '5113dc04e03f079525edd8df3f4f39e3' > ``` Assets are data files your project needs – for example, the training and @@ -104,22 +110,8 @@ $ python -m spacy project assets Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and even cloud storage such as GCS and S3. You can also fetch assets using git, by -replacing the `url` string with a `git` block, like this: - -> #### project.yml -> -> ```yaml -> assets: -> - dest: 'assets/training.spacy' -> git: -> repo: "https://github.com/example/repo" -> branch: "master" -> path: "some/path" -> checksum: '63373dd656daa1fd3043ce166a59474c' -> ``` - -spaCy will use Git's "sparse checkout" feature, to avoid download the whole -repository. 
+replacing the `url` string with a `git` block. spaCy will use Git's "sparse
+checkout" feature to avoid downloading the whole repository.

 ### 3. Run a command {#run}

@@ -236,10 +228,93 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.
| ------------- | -------------------------------------------------------------------------------------------------------------- |
| `vars`        | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets`      | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows`   | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `commands`    | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |

+### Data assets {#data-assets}
+
+Assets are any files that your project might need, like training and development
+corpora or pretrained weights for initializing your model. Assets are defined in
+the `assets` block of your `project.yml` and can be downloaded using the
+[`project assets`](/api/cli#project-assets) command. Defining checksums lets you
+verify that someone else running your project will use the same files you used.
+Asset URLs can use a number of different **protocols**: HTTP, HTTPS, FTP, SSH,
+and even **cloud storage** such as GCS and S3. You can also download assets from
+a **Git repo** instead.
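+
+The checksums are 32-character MD5 hashes, which is also what `get_checksum` in
+`spacy/cli/_util.py` computes. As a rough sketch of what the verification
+amounts to, assuming the asset has already been downloaded:
+
+```python
+# Minimal sketch: compare a local file against the checksum from project.yml.
+# The expected value here is the example hash from above, not a real file's.
+import hashlib
+from pathlib import Path
+
+def md5_checksum(path: Path) -> str:
+    # Hash the file in chunks so large corpora don't have to fit in memory
+    md5 = hashlib.md5()
+    with path.open("rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+if md5_checksum(Path("assets/training.spacy")) != "63373dd656daa1fd3043ce166a59474c":
+    print("Checksum mismatch: asset differs from the file defined in project.yml")
+```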
+ +#### Downloading from a URL or cloud storage {#data-assets-url} + +Under the hood, spaCy uses the +[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library so you +can use any protocol it supports. Note that you may need to install extra +dependencies to use certain protocols. + +> #### project.yml +> +> ```yaml +> assets: +> # Download from public HTTPS URL +> - dest: 'assets/training.spacy' +> url: 'https://example.com/data.spacy' +> checksum: '63373dd656daa1fd3043ce166a59474c' +> # Download from Google Cloud Storage bucket +> - dest: 'assets/development.spacy' +> url: 'gs://your-bucket/corpora' +> checksum: '5113dc04e03f079525edd8df3f4f39e3' +> ``` + +| Name | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `url` | The URL to download from, using the respective protocol. | +| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | + +#### Downloading from a Git repo {#data-assets-git} + +If a `git` block is provided, the asset is downloaded from the given Git +repository. You can download from any repo that you have access to. Under the +hood, this uses Git's "sparse checkout" feature, so you're only downloading the +files you need and not the whole repo. + +> #### project.yml +> +> ```yaml +> assets: +> - dest: 'assets/training.spacy' +> git: +> repo: 'https://github.com/example/repo' +> branch: 'master' +> path: 'path/training.spacy' +> checksum: '63373dd656daa1fd3043ce166a59474c' +> ``` + +| Name | Description | +| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root.
`branch`: The branch to download from. Defaults to `"master"`. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
+
+#### Working with private assets {#data-assets-private}
+
+> #### project.yml
+>
+> ```yaml
+> assets:
+>   - dest: 'assets/private_training_data.json'
+>     checksum: '63373dd656daa1fd3043ce166a59474c'
+>   - dest: 'assets/private_vectors.bin'
+>     checksum: '5113dc04e03f079525edd8df3f4f39e3'
+> ```
+
+For many projects, the datasets and weights you're working with might be
+company-internal and not available over the internet. In that case, you can
+specify the destination paths and a checksum, and leave out the URL. When your
+teammates clone and run your project, they can place the files in the respective
+directory themselves. The [`project assets`](/api/cli#project-assets) command
+will alert about missing files and mismatched checksums, so you can ensure that
+others are running your project with the same data.
+
 ### Dependencies and outputs {#deps-outputs}

 Each command defined in the `project.yml` can optionally define a list of
@@ -446,25 +521,6 @@ projects.

-### Working with private assets {#private-assets}
-
-For many projects, the datasets and weights you're working with might be
-company-internal and not available via a public URL. In that case, you can
-specify the destination paths and a checksum, and leave out the URL. When your
-teammates clone and run your project, they can place the files in the respective
-directory themselves. The [`spacy project assets`](/api/cli#project-assets)
-command will alert about missing files and mismatched checksums, so you can
-ensure that others are running your project with the same data.
-
-```yaml
-### project.yml
-assets:
-  - dest: 'assets/private_training_data.json'
-    checksum: '63373dd656daa1fd3043ce166a59474c'
-  - dest: 'assets/private_vectors.bin'
-    checksum: '5113dc04e03f079525edd8df3f4f39e3'
-```
-
 ## Remote Storage {#remote}

 You can persist your project outputs to a remote storage using the
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 2a47fd264..bf0c13b68 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -365,6 +365,8 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   [`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list
   of patterns as the second argument (instead of a variable number of
   arguments). The `on_match` callback becomes an optional keyword argument.
+- The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas have
+  been removed (see the sketch below).
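+  In practice, this means pronoun lemmas are now real word forms. A minimal
+  sketch, assuming an English pipeline is installed:
+
+  ```python
+  # In v2.x, token.lemma_ for pronouns was the "-PRON-" placeholder; in v3
+  # the lemmatizer assigns regular lemmas to pronouns as well.
+  import spacy
+
+  nlp = spacy.load("en_core_web_sm")
+  doc = nlp("She told me")
+  print([token.lemma_ for token in doc])  # no "-PRON-" entries anymore
+  ```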
### Removed or renamed API {#incompat-removed} From dd84577a98614260340f374813963142650914e6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Aug 2020 11:54:53 +0200 Subject: [PATCH 02/11] Update CLI utils, project.yml schema and add test --- spacy/cli/_util.py | 6 +++--- spacy/schemas.py | 19 +++++++++++++++++-- spacy/tests/test_cli.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index f1ab2effc..4ed316219 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -195,7 +195,7 @@ def get_checksum(path: Union[Path, str]) -> str: for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): dir_checksum.update(sub_file.read_bytes()) return dir_checksum.hexdigest() - raise ValueError(f"Can't get checksum for {path}: not a file or directory") + msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1) @contextmanager @@ -320,9 +320,9 @@ def git_sparse_checkout( repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None ): if dest.exists(): - raise IOError("Destination of checkout must not exist") + msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): - raise IOError("Parent of destination of checkout must exist") + msg.fail("Parent of destination of checkout must exist", exits=1) # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: cmd = ( diff --git a/spacy/schemas.py b/spacy/schemas.py index 15282e98e..069bd7c0a 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -283,7 +283,15 @@ class ConfigSchema(BaseModel): # Project config Schema -class ProjectConfigAsset(BaseModel): +class ProjectConfigAssetGitItem(BaseModel): + # fmt: off + repo: StrictStr = Field(..., title="URL of Git repo to download from") + path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)") + branch: StrictStr = Field("master", title="Branch to clone from") + # fmt: on + + +class ProjectConfigAssetURL(BaseModel): # fmt: off dest: StrictStr = Field(..., title="Destination of downloaded asset") url: Optional[StrictStr] = Field(None, title="URL of asset") @@ -291,6 +299,13 @@ class ProjectConfigAsset(BaseModel): # fmt: on +class ProjectConfigAssetGit(BaseModel): + # fmt: off + git: ProjectConfigAssetGitItem = Field(..., title="Git repo information") + checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") + # fmt: on + + class ProjectConfigCommand(BaseModel): # fmt: off name: StrictStr = Field(..., title="Name of command") @@ -310,7 +325,7 @@ class ProjectConfigCommand(BaseModel): class ProjectConfigSchema(BaseModel): # fmt: off vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") - assets: List[ProjectConfigAsset] = Field([], title="Data assets") + assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") # fmt: on diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 104c7c516..448aaf202 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -270,6 +270,41 @@ def test_pretrain_make_docs(): assert skip_count == 0 +def test_project_config_validation_full(): + config = { + "vars": {"some_var": 20}, + 
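+        # The config covers all the main project.yml sections, including both
+        # asset styles: a URL asset with a checksum, and a git block asset
+        # (validated as ProjectConfigAssetURL / ProjectConfigAssetGit)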
"directories": ["assets", "configs", "corpus", "scripts", "training"], + "assets": [ + { + "dest": "x", + "url": "https://example.com", + "checksum": "63373dd656daa1fd3043ce166a59474c", + }, + { + "dest": "y", + "git": { + "repo": "https://github.com/example/repo", + "branch": "develop", + "path": "y", + }, + }, + ], + "commands": [ + { + "name": "train", + "help": "Train a model", + "script": ["python -m spacy train config.cfg -o training"], + "deps": ["config.cfg", "corpus/training.spcy"], + "outputs": ["training/model-best"], + }, + {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True}, + ], + "workflows": {"all": ["train", "test"], "train": ["train"]}, + } + errors = validate(ProjectConfigSchema, config) + assert not errors + + @pytest.mark.parametrize( "config", [ From f31c4462ca1110f1c22c0d24dba372a49c580fe9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Aug 2020 13:27:59 +0200 Subject: [PATCH 03/11] Update docs [ci skip] --- website/docs/usage/linguistic-features.md | 61 +++++++++++++++++------ 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f2ec48d63..5c5198308 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -82,6 +82,14 @@ check whether a [`Doc`](/api/doc) object has been parsed with the `doc.is_parsed` attribute, which returns a boolean value. If this attribute is `False`, the default sentence iterator will raise an exception. + + +For a list of the syntactic dependency labels assigned by spaCy's models across +different languages, see the label schemes documented in the +[models directory](/models). + + + ### Noun chunks {#noun-chunks} Noun chunks are "base noun phrases" – flat phrases that have a noun as their @@ -288,11 +296,45 @@ for token in doc: | their | `ADJ` | `poss` | requests | | requests | `NOUN` | `dobj` | submit | - +The dependency parse can be a useful tool for **information extraction**, +especially when combined with other predictions like +[named entities](#named-entities). The following example extracts money and +currency values, i.e. entities labeled as `MONEY`, and then uses the dependency +parse to find the noun phrase they are referring to – for example `"Net income"` +→ `"$9.4 million"`. -For a list of the syntactic dependency labels assigned by spaCy's models across -different languages, see the label schemes documented in the -[models directory](/models). 
+```python +### {executable="true"} +import spacy + +nlp = spacy.load("en_core_web_sm") +# Merge noun phrases and entities for easier analysis +nlp.add_pipe("merge_entities") +nlp.add_pipe("merge_noun_chunks") + +TEXTS = [ + "Net income was $9.4 million compared to the prior year of $2.7 million.", + "Revenue exceeded twelve billion dollars, with a loss of $1b.", +] +for doc in nlp.pipe(TEXTS): + for token in doc: + if token.ent_type_ == "MONEY": + # We have an attribute and direct object, so check for subject + if token.dep_ in ("attr", "dobj"): + subj = [w for w in token.head.lefts if w.dep_ == "nsubj"] + if subj: + print(subj[0], "-->", token) + # We have a prepositional object with a preposition + elif token.dep_ == "pobj" and token.head.dep_ == "prep": + print(token.head.head, "-->", token) +``` + + + +For more examples of how to write rule-based information extraction logic that +takes advantage of the model's predictions produced by the different components, +see the usage guide on +[combining models and rules](/usage/rule-based-matching#models-rules). @@ -545,7 +587,7 @@ identifier from a knowledge base (KB). You can create your own [train a new Entity Linking model](/usage/training#entity-linker) using that custom-made KB. -### Accessing entity identifiers {#entity-linking-accessing} +### Accessing entity identifiers {#entity-linking-accessing model="entity linking"} The annotated KB identifier is accessible as either a hash value or as a string, using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) @@ -571,15 +613,6 @@ print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259'] print(ent_london_5) # ['London', 'GPE', 'Q84'] ``` -| Text | ent_type\_ | ent_kb_id\_ | -| -------- | ---------- | ----------- | -| Ada | `"PERSON"` | `"Q7259"` | -| Lovelace | `"PERSON"` | `"Q7259"` | -| was | - | - | -| born | - | - | -| in | - | - | -| London | `"GPE"` | `"Q84"` | - ## Tokenization {#tokenization} Tokenization is the task of splitting a text into meaningful segments, called From 0bab7c8b91787f5b084151b81a1a770c2f59837a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Aug 2020 14:21:29 +0200 Subject: [PATCH 04/11] Remove PRON_LEMMA symbol (#5968) --- spacy/symbols.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 28bbc9fc3..92607e120 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -472,7 +472,6 @@ def sort_nums(x): return x[1] -PRON_LEMMA = "-PRON-" NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] # Unfortunate hack here, to work around problem with long cpdef enum # (which is generating an enormous amount of C++ in Cython 0.24+) From 94705c21c8a341e66aabce17619acb257cc58bd4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Aug 2020 17:13:11 +0200 Subject: [PATCH 05/11] Allow reuse on validators to prevent reload error Otherwise this will cause an error if spaCy is live reloaded, e.g. 
in Streamlit --- spacy/schemas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index 069bd7c0a..0fdb1b332 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -63,7 +63,7 @@ class TokenPatternString(BaseModel): class Config: extra = "forbid" - @validator("*", pre=True, each_item=True) + @validator("*", pre=True, each_item=True, allow_reuse=True) def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -84,7 +84,7 @@ class TokenPatternNumber(BaseModel): class Config: extra = "forbid" - @validator("*", pre=True, each_item=True) + @validator("*", pre=True, each_item=True, allow_reuse=True) def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -145,7 +145,7 @@ class TokenPattern(BaseModel): allow_population_by_field_name = True alias_generator = lambda value: value.upper() - @validator("*", pre=True) + @validator("*", pre=True, allow_reuse=True) def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -265,7 +265,7 @@ class ConfigSchema(BaseModel): pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} components: Dict[str, Dict[str, Any]] - @root_validator + @root_validator(allow_reuse=True) def validate_config(cls, values): """Perform additional validation for settings with dependencies.""" pt = values.get("pretraining") From b89f6fa011c6f49e3e04041c6646c045ffe45845 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Aug 2020 17:13:33 +0200 Subject: [PATCH 06/11] Fix meta defaults and error in package command --- spacy/cli/package.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 523e8a99a..4e5038951 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -77,7 +77,9 @@ def package( meta = generate_meta(meta, msg) errors = validate(ModelMetaSchema, meta) if errors: - msg.fail("Invalid model meta.json", "\n".join(errors), exits=1) + msg.fail("Invalid model meta.json") + print("\n".join(errors)) + sys.exit(1) model_name = meta["lang"] + "_" + meta["name"] model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v @@ -118,10 +120,10 @@ def get_meta( "lang": "en", "name": "model", "version": "0.0.0", - "description": None, - "author": None, - "email": None, - "url": None, + "description": "", + "author": "", + "email": "", + "url": "", "license": "MIT", } meta.update(existing_meta) From fdcaf86c54f931b739ea8f5a5906a1c6ec1714d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 25 Aug 2020 17:13:50 +0200 Subject: [PATCH 07/11] Adjust docstring End sentence earlier so it's shown as a full sentence in --help --- spacy/cli/project/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index ae292e3d7..6c0f32171 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -14,7 +14,7 @@ def project_pull_cli( project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), # fmt: on ): - """Retrieve any precomputed outputs from a remote storage that are available. + """Retrieve available precomputed outputs from a remote storage. You can alias remotes in your project.yml by mapping them to storage paths. A storage can be anything that the smart-open library can upload to, e.g. 
gcs, aws, ssh, local directories etc From 2771e4f2b3bfcc19f6c11a6801d95f4bb595c029 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2020 04:00:14 +0200 Subject: [PATCH 08/11] Fix the git "sparse checkout" functionality (#5973) * Fix the git sparse checkout functionality * Format --- spacy/cli/_util.py | 40 ++++++++++++++++++++++++++++++---------- spacy/scorer.py | 4 ++-- spacy/util.py | 16 ++++++++++++---- 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 4ed316219..ebd3809fd 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -322,21 +322,41 @@ def git_sparse_checkout( if dest.exists(): msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): - msg.fail("Parent of destination of checkout must exist", exits=1) + raise IOError("Parent of destination of checkout must exist") + # We're using Git, partial clone and sparse checkout to + # only clone the files we need + # This ends up being RIDICULOUS. omg. + # So, every tutorial and SO post talks about 'sparse checkout'...But they + # go and *clone* the whole repo. Worthless. And cloning part of a repo + # turns out to be completely broken. The only way to specify a "path" is.. + # a path *on the server*? The contents of which, specifies the paths. Wat. + # Obviously this is hopelessly broken and insecure, because you can query + # arbitrary paths on the server! So nobody enables this. + # What we have to do is disable *all* files. We could then just checkout + # the path, and it'd "work", but be hopelessly slow...Because it goes and + # transfers every missing object one-by-one. So the final piece is that we + # need to use some weird git internals to fetch the missings in bulk, and + # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: + # This is the "clone, but don't download anything" part. cmd = ( - f"git clone {repo} {tmp_dir} --no-checkout " - "--depth 1 --config core.sparseCheckout=true" + f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " + "--filter=blob:none" # <-- The key bit ) if branch is not None: cmd = f"{cmd} -b {branch}" - run_command(cmd) - with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: - f.write(subpath) - run_command(["git", "-C", str(tmp_dir), "fetch"]) - run_command(["git", "-C", str(tmp_dir), "checkout"]) + run_command(cmd, capture=True) + # Now we need to find the missing filenames for the subpath we want. + # Looking for this 'rev-list' command in the git --help? Hah. 
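+        # With --missing=print, rev-list walks the objects reachable for the
+        # given subpath and prints the ones not present locally (skipped by
+        # the blob:none filter above) with a leading "?", stripped below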
+ cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" + ret = run_command(cmd, capture=True) + missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) + # Now pass those missings into another bit of git internals + run_command( + f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings + ) + # And finally, we can checkout our subpath + run_command(f"git -C {tmp_dir} checkout {branch} {subpath}") # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) - print(dest) - print(list(dest.iterdir())) diff --git a/spacy/scorer.py b/spacy/scorer.py index 28975ad43..dc017f82f 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -30,11 +30,11 @@ class PRFScore: @property def precision(self) -> float: - return (self.tp / (self.tp + self.fp + 1e-100)) + return self.tp / (self.tp + self.fp + 1e-100) @property def recall(self) -> float: - return (self.tp / (self.tp + self.fn + 1e-100)) + return self.tp / (self.tp + self.fn + 1e-100) @property def fscore(self) -> float: diff --git a/spacy/util.py b/spacy/util.py index 736f4d805..eb40dfa21 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -572,7 +572,7 @@ def join_command(command: List[str]) -> str: return " ".join(shlex.quote(cmd) for cmd in command) -def run_command(command: Union[str, List[str]]) -> None: +def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. @@ -582,13 +582,21 @@ def run_command(command: Union[str, List[str]]) -> None: if isinstance(command, str): command = split_command(command) try: - status = subprocess.call(command, env=os.environ.copy()) + ret = subprocess.run( + command, + env=os.environ.copy(), + capture_output=capture, + input=stdin, + text=True, + check=True, + ) except FileNotFoundError: raise FileNotFoundError( Errors.E970.format(str_command=" ".join(command), tool=command[0]) ) from None - if status != 0: - sys.exit(status) + if ret.returncode != 0: + sys.exit(ret.returncode) + return ret @contextmanager From 65474723479a4f452772fdddefd3afd129d381ce Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2020 04:02:34 +0200 Subject: [PATCH 09/11] Set version to v3.0.0a12 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ff8c064ca..418e44c1d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a11" +__version__ = "3.0.0a12" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 884cac5fb5729cd808c747fe0cbb4803606b07cf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2020 04:33:42 +0200 Subject: [PATCH 10/11] Make run_command backwards compatible --- spacy/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index eb40dfa21..57693a776 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -585,10 +585,11 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> ret = subprocess.run( command, env=os.environ.copy(), - capture_output=capture, input=stdin, text=True, check=True, + stdout=subprocess.PIPE if capture else None, + 
stderr=subprocess.PIPE if capture else None, ) except FileNotFoundError: raise FileNotFoundError( From 77852d242867bee6a8032da8f673c5171f28ed79 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2020 05:02:43 +0200 Subject: [PATCH 11/11] Fix run_command for python 3.6 --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 57693a776..0268ae91f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -586,7 +586,7 @@ def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> command, env=os.environ.copy(), input=stdin, - text=True, + encoding="utf8", check=True, stdout=subprocess.PIPE if capture else None, stderr=subprocess.PIPE if capture else None,
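+            # capture_output= and text= only exist on subprocess.run in
+            # Python 3.7+, so stdout/stderr are piped explicitly and decoded
+            # via encoding="utf8" to keep supporting Python 3.6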