From 691e0088cfa425dc81dc6e1e8e3d1d5ee54550db Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Sep 2019 11:22:03 +0200 Subject: [PATCH 1/8] Remove duplicate tok2vec property (closes #4302) --- spacy/syntax/nn_parser.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c4edef137..85f7b5bb9 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -130,10 +130,6 @@ cdef class Parser: def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) - @property - def tok2vec(self): - return self.model.tok2vec - @property def move_names(self): names = [] From 198b7e978964f44657b0e0b50740ec447baa4512 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Sep 2019 14:48:35 +0200 Subject: [PATCH 2/8] Auto-format [ci skip] --- website/docs/api/annotation.md | 204 ++++++++++++++++----------------- website/docs/api/cli.md | 60 +++++----- 2 files changed, 132 insertions(+), 132 deletions(-) diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md index ac888cec9..71ff8d5d5 100644 --- a/website/docs/api/annotation.md +++ b/website/docs/api/annotation.md @@ -186,63 +186,63 @@ The German part-of-speech tagger uses the annotation scheme. We also map the tags to the simpler Google Universal POS tag set. -| Tag |  POS | Morphology | Description | -| --------- | ------- | ------------------------------------------- | ------------------------------------------------- | -| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark | -| `$,` | `PUNCT` | `PunctType=comm` | comma | -| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark | -| `ADJA` | `ADJ` | | adjective, attributive | -| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative | -| `ADV` | `ADV` | | adverb | -| `APPO` | `ADP` | `AdpType=post` | postposition | -| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left | -| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article | -| `APZR` | `ADP` | `AdpType=circ` | circumposition right | -| `ART` | `DET` | `PronType=art` | definite or indefinite article | -| `CARD` | `NUM` | `NumType=card` | cardinal number | -| `FM` | `X` | `Foreign=yes` | foreign language material | -| `ITJ` | `INTJ` | | interjection | -| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction | -| `KON` | `CONJ` | | coordinate conjunction | -| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive | -| `KOUS` | `SCONJ` | | subordinate conjunction with sentence | -| `NE` | `PROPN` | | proper noun | -| `NNE` | `PROPN` | | proper noun | -| `NN` | `NOUN` | | noun, singular or mass | -| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb | -| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun | -| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun | -| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner | -| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun | -| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun | -| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun | -| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun | -| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun | -| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun | -| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun | -| 
`PTKA` | `PART` | | particle with adjective or adverb | -| `PTKANT` | `PART` | `PartType=res` | answer particle | -| `PTKNEG` | `PART` | `Negative=yes` | negative particle | -| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle | -| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive | -| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun | -| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun | -| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun | -| `TRUNC` | `X` | `Hyph=yes` | word remnant | -| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary | -| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary | -| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary | -| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary | -| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal | -| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal | -| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal | -| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full | -| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full | -| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full | -| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full | -| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full | -| `XY` | `X` | | non-word containing non-letter | -| `SP` | `SPACE` | | space | +| Tag |  POS | Morphology | Description | +| --------- | ------- | ---------------------------------------- | ------------------------------------------------- | +| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark | +| `$,` | `PUNCT` | `PunctType=comm` | comma | +| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark | +| `ADJA` | `ADJ` | | adjective, attributive | +| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative | +| `ADV` | `ADV` | | adverb | +| `APPO` | `ADP` | `AdpType=post` | postposition | +| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left | +| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article | +| `APZR` | `ADP` | `AdpType=circ` | circumposition right | +| `ART` | `DET` | `PronType=art` | definite or indefinite article | +| `CARD` | `NUM` | `NumType=card` | cardinal number | +| `FM` | `X` | `Foreign=yes` | foreign language material | +| `ITJ` | `INTJ` | | interjection | +| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction | +| `KON` | `CONJ` | | coordinate conjunction | +| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive | +| `KOUS` | `SCONJ` | | subordinate conjunction with sentence | +| `NE` | `PROPN` | | proper noun | +| `NNE` | `PROPN` | | proper noun | +| `NN` | `NOUN` | | noun, singular or mass | +| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb | +| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun | +| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun | +| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner | +| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun | +| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun | +| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun | +| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun | +| `PRELAT` | `DET` | `PronType=rel` 
| attributive relative pronoun | +| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun | +| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun | +| `PTKA` | `PART` | | particle with adjective or adverb | +| `PTKANT` | `PART` | `PartType=res` | answer particle | +| `PTKNEG` | `PART` | `Negative=yes` | negative particle | +| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle | +| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive | +| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun | +| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun | +| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun | +| `TRUNC` | `X` | `Hyph=yes` | word remnant | +| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary | +| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary | +| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary | +| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary | +| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal | +| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal | +| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal | +| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full | +| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full | +| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full | +| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full | +| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full | +| `XY` | `X` | | non-word containing non-letter | +| `SP` | `SPACE` | | space | @@ -379,51 +379,51 @@ The German dependency labels use the [TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html) annotation scheme. 
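To see how these schemes surface at runtime, the sketch below prints the fine-grained TIGER/STTS tag, the mapped Universal POS tag and the dependency label for each token. This is a minimal example, assuming the small German model `de_core_news_sm` has been downloaded; the sentence is illustrative:

```python
import spacy

# Assumes: python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")
doc = nlp("Die Katze schläft auf dem Sofa")
for token in doc:
    # token.tag_ is the TIGER/STTS tag, token.pos_ the Universal POS tag and
    # token.dep_ the TIGER dependency label; spacy.explain() looks up the
    # human-readable descriptions from the tables in this document
    print(token.text, token.tag_, token.pos_, token.dep_, spacy.explain(token.tag_))
```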
-| Label | Description | -| ------ | ------------------------------- | -| `ac` | adpositional case marker | -| `adc` | adjective component | -| `ag` | genitive attribute | -| `ams` | measure argument of adjective | -| `app` | apposition | -| `avc` | adverbial phrase component | -| `cc` | comparative complement | -| `cd` | coordinating conjunction | -| `cj` | conjunct | -| `cm` | comparative conjunction | -| `cp` | complementizer | -| `cvc` | collocational verb construction | -| `da` | dative | -| `dm` | discourse marker | -| `ep` | expletive es | -| `ju` | junctor | -| `mnr` | postnominal modifier | -| `mo` | modifier | -| `ng` | negation | -| `nk` | noun kernel element | -| `nmc` | numerical component | -| `oa` | accusative object | -| `oa2` | second accusative object | -| `oc` | clausal object | -| `og` | genitive object | -| `op` | prepositional object | -| `par` | parenthetical element | -| `pd` | predicate | -| `pg` | phrasal genitive | -| `ph` | placeholder | -| `pm` | morphological particle | -| `pnc` | proper noun component | -| `punct` | punctuation | -| `rc` | relative clause | -| `re` | repeated element | -| `rs` | reported speech | -| `sb` | subject | -| `sbp` | passivized subject (PP) | -| `sp` | subject or predicate | -| `svp` | separable verb prefix | -| `uc` | unit component | -| `vo` | vocative | -| `ROOT` | root | +| Label | Description | +| ------- | ------------------------------- | +| `ac` | adpositional case marker | +| `adc` | adjective component | +| `ag` | genitive attribute | +| `ams` | measure argument of adjective | +| `app` | apposition | +| `avc` | adverbial phrase component | +| `cc` | comparative complement | +| `cd` | coordinating conjunction | +| `cj` | conjunct | +| `cm` | comparative conjunction | +| `cp` | complementizer | +| `cvc` | collocational verb construction | +| `da` | dative | +| `dm` | discourse marker | +| `ep` | expletive es | +| `ju` | junctor | +| `mnr` | postnominal modifier | +| `mo` | modifier | +| `ng` | negation | +| `nk` | noun kernel element | +| `nmc` | numerical component | +| `oa` | accusative object | +| `oa2` | second accusative object | +| `oc` | clausal object | +| `og` | genitive object | +| `op` | prepositional object | +| `par` | parenthetical element | +| `pd` | predicate | +| `pg` | phrasal genitive | +| `ph` | placeholder | +| `pm` | morphological particle | +| `pnc` | proper noun component | +| `punct` | punctuation | +| `rc` | relative clause | +| `re` | repeated element | +| `rs` | reported speech | +| `sb` | subject | +| `sbp` | passivized subject (PP) | +| `sp` | subject or predicate | +| `svp` | separable verb prefix | +| `uc` | unit component | +| `vo` | vocative | +| `ROOT` | root | diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 32e3623b0..5802bf41e 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -174,12 +174,12 @@ All output files generated by this command are compatible with -| ID | Description | -| ------------------------------ | --------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. 
| +| ID | Description | +| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | ## Train {#train} @@ -291,26 +291,26 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------- | -| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. | -| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | -| `output_dir` | positional | Directory to write models to on each epoch. | -| `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | -| `--embed-rows`, `-er` | option | Number of embedding rows. | -| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | -| `--dropout`, `-d` | option | Dropout rate. | -| `--batch-size`, `-bs` | option | Number of words per training batch. | -| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | -| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. | -| `--seed`, `-s` | option | Seed for random number generators. | -| `--n-iter`, `-i` | option | Number of iterations to pretrain. | -| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | -| `--n-save-every`, `-se` | option | Save model every X batches. | -| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.| -| `--epoch-start`, `-es` 2.1.5 | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. 
Prevents unintended overwriting of existing weight files.| -| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | +| Argument | Type | Description | +| ----------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details. | +| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | +| `output_dir` | positional | Directory to write models to on each epoch. | +| `--width`, `-cw` | option | Width of CNN layers. | +| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--embed-rows`, `-er` | option | Number of embedding rows. | +| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. | +| `--dropout`, `-d` | option | Dropout rate. | +| `--batch-size`, `-bs` | option | Number of words per training batch. | +| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | +| `--min-length`, `-nw` | option | Minimum words per example. Shorter examples are discarded. | +| `--seed`, `-s` | option | Seed for random number generators. | +| `--n-iter`, `-i` | option | Number of iterations to pretrain. | +| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | +| `--n-save-every`, `-se` | option | Save model every X batches. | +| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | +| `--epoch-start`, `-es` 2.1.5 | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. | +| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | ### JSONL format for raw text {#pretrain-jsonl} @@ -330,10 +330,10 @@ tokenization can be provided. > srsly.write_jsonl("/path/to/text.jsonl", data) > ``` -| Key | Type | Description | -| -------- | ------- | -------------------------------------------- | +| Key | Type | Description | +| -------- | ------- | ---------------------------------------------------------- | | `text` | unicode | The raw input text. Is not required if `tokens` available. | -| `tokens` | list | Optional tokenization, one string per token. | +| `tokens` | list | Optional tokenization, one string per token. | ```json ### Example @@ -424,7 +424,7 @@ pip install dist/en_model-0.0.0.tar.gz | `input_dir` | positional | Path to directory containing model data. | | `output_dir` | positional | Directory to create package folder in. | | `--meta-path`, `-m` 2 | option | Path to `meta.json` file (optional). | -| `--create-meta`, `-c` 2 | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. -| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | +| `--create-meta`, `-c` 2 | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. 
If an existing file is found, its entries will be shown as the defaults in the command line prompt. |
+| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | A Python package containing the spaCy model. |

From 25c2b4b9a5be2779c6338c6fb148f5aea9ffe9f4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 17 Sep 2019 14:51:44 +0200
Subject: [PATCH 3/8] Improve init-model docs (see #4137)

---
 website/docs/api/annotation.md | 4 ++--
 website/docs/api/cli.md        | 11 +++++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md
index 71ff8d5d5..7f7b46260 100644
--- a/website/docs/api/annotation.md
+++ b/website/docs/api/annotation.md
@@ -584,8 +584,8 @@ data.
 ```python
 ### Entry structure
 {
-    "orth": string,
-    "id": int,
+    "orth": string,  # the word text
+    "id": int,  # can correspond to row in vectors table
     "lower": string,
     "norm": string,
     "shape": string
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 5802bf41e..0920e7e07 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -347,14 +347,17 @@ tokenization can be provided.
 
 Create a new model directory from raw data, like word frequencies, Brown
 clusters and word vectors. This command is similar to the `spacy model` command
-in v1.x.
+in v1.x. Note that in order to populate the model's vocab, you need to pass in a
+JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) as
+`--jsonl-loc` with optional `id` values that correspond to the vectors table.
+Just loading in vectors will not automatically populate the vocab.
 
 As of v2.1.0, the `--freqs-loc` and `--clusters-loc` are deprecated and have
 been replaced with the `--jsonl-loc` argument, which lets you pass in a a
-[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
-lexical entry per line. For more details on the format, see the
+[JSONL](http://jsonlines.org/) file containing one lexical entry per line. For
+more details on the format, see the
 [annotation specs](/api/annotation#vocab-jsonl).
 
@@ -368,7 +371,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
| ----------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
-| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted vocabulary file with lexical attributes. |
+| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors file. Should be a tab-separated file in Word2Vec format where the first column contains the word and the remaining columns the values. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
| **CREATES** | model | A spaCy model containing the vocab and vectors.
| From f566e69f38d27d1c41233fe67307377fd73abaf5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Sep 2019 14:59:12 +0200 Subject: [PATCH 4/8] Fix --vectors-loc docs (closes #4270) --- website/docs/api/cli.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 0920e7e07..12f17f1b7 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -367,14 +367,14 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors file. Should be a tab-separated file in Word2Vec format where the first column contains the word and the remaining columns the values. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors file. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. 
| ## Evaluate {#evaluate new="2"} From ee15fdfe88598b5bddf801d0d6700386efb4b7ab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Sep 2019 14:59:42 +0200 Subject: [PATCH 5/8] Fix wording [ci skip] --- website/docs/api/cli.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 12f17f1b7..c5e77dc0d 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -367,14 +367,14 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors file. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| ----------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--prune-vectors`, `-V` | flag | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} From 875f3e5d8c09738d9abe42c2d3d952ba5b6870b0 Mon Sep 17 00:00:00 2001 From: tamuhey Date: Thu, 19 Sep 2019 04:31:27 +0900 Subject: [PATCH 6/8] remove redundant __call__ method in pipes.TextCategorizer (#4305) * remove redundant __call__ method in pipes.TextCategorizer Because the parent __call__ method behaves in the same way. 
* fix: Pipe.__call__ arg * fix: invalid arg in Pipe.__call__ * modified: spacy/tests/regression/test_issue4278.py (#4278) * deleted: Pipfile --- spacy/pipeline/pipes.pyx | 9 ++------- spacy/tests/regression/test_issue4278.py | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 190116a2e..2ca1801c9 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -69,7 +69,7 @@ class Pipe(object): predictions = self.predict([doc]) if isinstance(predictions, tuple) and len(predictions) == 2: scores, tensors = predictions - self.set_annotations([doc], scores, tensor=tensors) + self.set_annotations([doc], scores, tensors=tensors) else: self.set_annotations([doc], predictions) return doc @@ -90,7 +90,7 @@ class Pipe(object): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions - self.set_annotations(docs, scores, tensor=tensors) + self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) yield from docs @@ -932,11 +932,6 @@ class TextCategorizer(Pipe): def labels(self, value): self.cfg["labels"] = tuple(value) - def __call__(self, doc): - scores, tensors = self.predict([doc]) - self.set_annotations([doc], scores, tensors=tensors) - return doc - def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): docs = list(docs) diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py index 4c85d15c4..cb09340ff 100644 --- a/spacy/tests/regression/test_issue4278.py +++ b/spacy/tests/regression/test_issue4278.py @@ -13,7 +13,7 @@ class DummyPipe(Pipe): def predict(self, docs): return ([1, 2, 3], [4, 5, 6]) - def set_annotations(self, docs, scores, tensor=None): + def set_annotations(self, docs, scores, tensors=None): return docs From 72463b062f06f93f7673beff0d25f744d560aa23 Mon Sep 17 00:00:00 2001 From: Moshe Hazoom Date: Wed, 18 Sep 2019 22:34:34 +0300 Subject: [PATCH 7/8] Improve speed of _merge method (#4300) * make merge more efficient * fix offsets * merge works with relative indices * remove printing * Add the SCA * fix SCA date * more cythonize _retokenize.pyx * more cythonize _retokenize.pyx * fix only declaration in _retokenize.pyx * switch back to absolute head * switch back to absolute head * fix comment * merge from origin repo --- .github/contributors/Hazoom.md | 106 +++++++++++++++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 3 +- 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/Hazoom.md diff --git a/.github/contributors/Hazoom.md b/.github/contributors/Hazoom.md new file mode 100644 index 000000000..762cb5bef --- /dev/null +++ b/.github/contributors/Hazoom.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Moshe Hazoom | +| Company name (if applicable) | Amenity Analytics | +| Title or role (if applicable) | NLP Engineer | +| Date | 2019-09-15 | +| GitHub username | Hazoom | +| Website (optional) | | diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 741be7e6a..5b0747fa0 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -146,11 +146,12 @@ def _merge(Doc doc, merges): syntactic root of the span. RETURNS (Token): The first newly merged token. """ - cdef int i, merge_index, start, end, token_index + cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index cdef Span span cdef const LexemeC* lex cdef TokenC* token cdef Pool mem = Pool() + cdef int merged_iob = 0 tokens = mem.alloc(len(merges), sizeof(TokenC)) spans = [] From de5a9ecdf3e60240c78526d04aaa0931e3e825a6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 18 Sep 2019 21:37:17 +0200 Subject: [PATCH 8/8] Distinction between outside, missing and blocked NER annotations (#4307) * remove duplicate unit test * unit test (currently failing) for issue 4267 * bugfix: ensure doc.ents preserves kb_id annotations * fix in setting doc.ents with empty label * rename * test for presetting an entity to a certain type * allow overwriting Outside + blocking presets * fix actions when previous label needs to be kept * fix default ent_iob in set entities * cleaner solution with U- action * remove debugging print statements * unit tests with explicit transitions and is_valid testing * remove U- from move_names explicitly * remove unit tests with pre-trained models that don't work * remove (working) unit tests with pre-trained models * clean up unit tests * move unit tests * small fixes * remove two TODO's from doc.ents comments --- spacy/errors.py | 2 +- spacy/syntax/ner.pyx | 52 ++++--- spacy/syntax/nn_parser.pyx | 4 +- spacy/tests/doc/test_add_entities.py | 17 ++- spacy/tests/parser/test_ner.py | 157 +++++++++++++++++++-- spacy/tests/regression/test_issue1-1000.py | 2 +- spacy/tests/regression/test_issue4267.py | 42 ++++++ spacy/tokens/doc.pyx | 56 ++++---- spacy/tokens/token.pyx | 3 +- 9 files changed, 273 insertions(+), 62 deletions(-) create mode 100644 spacy/tests/regression/test_issue4267.py diff --git a/spacy/errors.py b/spacy/errors.py index 587a6e700..3b96179d7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -118,7 +118,7 @@ class Errors(object): E011 = ("Unknown operator: '{op}'. 
Options: {opts}") E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") E013 = ("Error selecting action in matcher") - E014 = ("Uknown tag ID: {tag}") + E014 = ("Unknown tag ID: {tag}") E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use " "`force=True` to overwrite.") E016 = ("MultitaskObjective target should be function or one of: dep, " diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 767e4c2e0..9f8ad418c 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -66,7 +66,8 @@ cdef class BiluoPushDown(TransitionSystem): UNIT: Counter(), OUT: Counter() } - actions[OUT][''] = 1 + actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity + actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity for entity_type in kwargs.get('entity_types', []): for action in (BEGIN, IN, LAST, UNIT): actions[action][entity_type] = 1 @@ -161,8 +162,7 @@ cdef class BiluoPushDown(TransitionSystem): for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] - else: - raise KeyError(Errors.E022.format(name=name)) + raise KeyError(Errors.E022.format(name=name)) cdef Transition init_transition(self, int clas, int move, attr_t label) except *: # TODO: Apparent Cython bug here when we try to use the Transition() @@ -266,7 +266,7 @@ cdef class Begin: return False elif label == 0: return False - elif preset_ent_iob == 1 or preset_ent_iob == 2: + elif preset_ent_iob == 1: # Ensure we don't clobber preset entities. If no entity preset, # ent_iob is 0 return False @@ -282,8 +282,8 @@ cdef class Begin: # Otherwise, force acceptance, even if we're across a sentence # boundary or the token is whitespace. return True - elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3: - # If the next word is B or O, we can't B now + elif st.B_(1).ent_iob == 3: + # If the next word is B, we can't B now return False elif st.B_(1).sent_start == 1: # Don't allow entities to extend across sentence boundaries @@ -326,6 +326,7 @@ cdef class In: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef int preset_ent_iob = st.B_(0).ent_iob + cdef attr_t preset_ent_label = st.B_(0).ent_type if label == 0: return False elif st.E_(0).ent_type != label: @@ -335,13 +336,22 @@ cdef class In: elif st.B(1) == -1: # If we're at the end, we can't I. return False - elif preset_ent_iob == 2: - return False elif preset_ent_iob == 3: return False - elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3: - # If we know the next word is B or O, we can't be I (must be L) + elif st.B_(1).ent_iob == 3: + # If we know the next word is B, we can't be I (must be L) return False + elif preset_ent_iob == 1: + if st.B_(1).ent_iob in (0, 2): + # if next preset is missing or O, this can't be I (must be L) + return False + elif label != preset_ent_label: + # If label isn't right, reject + return False + else: + # Otherwise, force acceptance, even if we're across a sentence + # boundary or the token is whitespace. 
+ return True elif st.B(1) != -1 and st.B_(1).sent_start == 1: # Don't allow entities to extend across sentence boundaries return False @@ -387,17 +397,24 @@ cdef class In: else: return 1 - cdef class Last: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + cdef attr_t preset_ent_label = st.B_(0).ent_type if label == 0: return False elif not st.entity_is_open(): return False - elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1: + elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1: # If a preset entity has I followed by not-I, is L - return True + if label != preset_ent_label: + # If label isn't right, reject + return False + else: + # Otherwise, force acceptance, even if we're across a sentence + # boundary or the token is whitespace. + return True elif st.E_(0).ent_type != label: return False elif st.B_(1).ent_iob == 1: @@ -450,12 +467,13 @@ cdef class Unit: cdef int preset_ent_iob = st.B_(0).ent_iob cdef attr_t preset_ent_label = st.B_(0).ent_type if label == 0: - return False + # this is only allowed if it's a preset blocked annotation + if preset_ent_label == 0 and preset_ent_iob == 3: + return True + else: + return False elif st.entity_is_open(): return False - elif preset_ent_iob == 2: - # Don't clobber preset O - return False elif st.B_(1).ent_iob == 1: # If next token is In, we can't be Unit -- must be Begin return False diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 85f7b5bb9..18c45fdfc 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -135,7 +135,9 @@ cdef class Parser: names = [] for i in range(self.moves.n_moves): name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label) - names.append(name) + # Explicitly removing the internal "U-" token used for blocking entities + if name != "U-": + names.append(name) return names nr_feature = 8 diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 433541c48..374c3ddd8 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -16,10 +16,23 @@ def test_doc_add_entities_set_ents_iob(en_vocab): ner(doc) assert len(list(doc.ents)) == 0 assert [w.ent_iob_ for w in doc] == (["O"] * len(doc)) + doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] - assert [w.ent_iob_ for w in doc] == ["", "", "", "B"] + assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] + doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] - assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""] + assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"] + + +def test_ents_reset(en_vocab): + text = ["This", "is", "a", "lion"] + doc = get_doc(en_vocab, text) + ner = EntityRecognizer(en_vocab) + ner.begin_training([]) + ner(doc) + assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) + doc.ents = list(doc.ents) + assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) def test_add_overlapping_entities(en_vocab): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 43c00a963..65586bda1 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals import pytest -from spacy.pipeline import EntityRecognizer +from spacy.lang.en import English + +from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown from spacy.gold import GoldParse @@ -80,14 +82,145 @@ def test_get_oracle_moves_negative_O(tsys, vocab): 
assert names


-def test_doc_add_entities_set_ents_iob(en_vocab):
-    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
-    ner = EntityRecognizer(en_vocab)
-    ner.begin_training([])
-    ner(doc)
-    assert len(list(doc.ents)) == 0
-    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
-    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
-    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
+def test_accept_blocked_token():
+    """Test successful blocking of tokens to be in an entity."""
+    # 1. test normal behaviour
+    nlp1 = English()
+    doc1 = nlp1("I live in New York")
+    ner1 = EntityRecognizer(doc1.vocab)
+    assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
+    assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
+
+    # Add the OUT action
+    ner1.moves.add_action(5, "")
+    ner1.add_label("GPE")
+    # Get into the state just before "New"
+    state1 = ner1.moves.init_batch([doc1])[0]
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    ner1.moves.apply_transition(state1, "O")
+    # Check that B-GPE is valid.
+    assert ner1.moves.is_valid(state1, "B-GPE")
+
+    # 2. test blocking behaviour
+    nlp2 = English()
+    doc2 = nlp2("I live in New York")
+    ner2 = EntityRecognizer(doc2.vocab)
+
+    # set "New York" to a blocked entity
+    doc2.ents = [(0, 3, 5)]
+    assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
+    assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
+
+    # Check that B-GPE is now invalid.
+    ner2.moves.add_action(4, "")
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state2 = ner2.moves.init_batch([doc2])[0]
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    ner2.moves.apply_transition(state2, "O")
+    # we can only use U- for "New"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+    ner2.moves.apply_transition(state2, "U-")
+    # we can only use U- for "York"
+    assert not ner2.moves.is_valid(state2, "B-GPE")
+    assert ner2.moves.is_valid(state2, "U-")
+
+
+def test_overwrite_token():
+    nlp = English()
+    ner1 = nlp.create_pipe("ner")
+    nlp.add_pipe(ner1, name="ner")
+    nlp.begin_training()
+
+    # The untrained NER will predict O for each token
+    doc = nlp("I live in New York")
+    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
+    assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
+
+    # Check that a new ner can overwrite O
+    ner2 = EntityRecognizer(doc.vocab)
+    ner2.moves.add_action(5, "")
+    ner2.add_label("GPE")
+    state = ner2.moves.init_batch([doc])[0]
+    assert ner2.moves.is_valid(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "U-GPE")
+    ner2.moves.apply_transition(state, "B-GPE")
+    assert ner2.moves.is_valid(state, "I-GPE")
+    assert ner2.moves.is_valid(state, "L-GPE")
+
+
+def test_ruler_before_ner():
+    """ Test that an NER works after an entity_ruler: the second can add annotations """
+    nlp = English()
+
+    # 1 : Entity Ruler - should set "this" to B and everything else to empty
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # 2: untrained NER - should set everything else to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner)
+    nlp.begin_training()
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_ner_before_ruler():
+    """ Test that an entity_ruler works after an NER: the second can overwrite O annotations """
+    nlp = English()
+
+    # 1: untrained NER - should set everything to O
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+
+    # 2 : Entity Ruler - should set "this" to B and keep everything else O
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "THING", "pattern": "This"}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    doc = nlp("This is Antti Korhonen speaking in Finland")
+    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
+    expected_types = ["THING", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+def test_block_ner():
+    """ Test functionality for blocking tokens so they can't be in a named entity """
+    # block "Antti L Korhonen" from being a named entity
+    nlp = English()
+    nlp.add_pipe(BlockerComponent1(2, 5))
+    untrained_ner = nlp.create_pipe("ner")
+    untrained_ner.add_label("MY_LABEL")
+    nlp.add_pipe(untrained_ner, name="uner")
+    nlp.begin_training()
+    doc = nlp("This is Antti L Korhonen speaking in Finland")
+    expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
+    expected_types = ["", "", "", "", "", "", "", ""]
+    assert [token.ent_iob_ for token in doc] == expected_iobs
+    assert [token.ent_type_ for token in doc] == expected_types
+
+
+class BlockerComponent1(object):
+    name = "my_blocker"
+
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __call__(self, doc):
+        doc.ents = [(0, self.start, self.end)]
+        return doc
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index febf2b5b3..b3f347765 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -426,7 +426,7 @@ def test_issue957(en_tokenizer):
 def test_issue999(train_data):
     """Test that adding entities and resuming training works passably OK.
     There are two issues here:
-    1) We have to readd labels. This isn't very nice.
+    1) We have to re-add labels. This isn't very nice.
     2) There's no way to set the learning rate for the weight update, so we
     end up out-of-scale, causing it to learn too fast.
""" diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py new file mode 100644 index 000000000..5fc61e142 --- /dev/null +++ b/spacy/tests/regression/test_issue4267.py @@ -0,0 +1,42 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +import spacy + +from spacy.lang.en import English +from spacy.pipeline import EntityRuler +from spacy.tokens import Span + + +def test_issue4267(): + """ Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("PEOPLE") + nlp.add_pipe(ner) + nlp.begin_training() + + assert "ner" in nlp.pipe_names + + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.is_nered + for token in doc1: + assert token.ent_iob == 2 + + # add entity ruler and run again + ruler = EntityRuler(nlp) + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.is_nered + for token in doc2: + assert token.ent_iob == 2 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e863b0807..9cf8e7fa5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -256,7 +256,7 @@ cdef class Doc: def is_nered(self): """Check if the document has named entities set. Will return True if *any* of the tokens has a named entity tag set (even if the others are - uknown values). + unknown values). """ if len(self) == 0: return True @@ -525,13 +525,11 @@ cdef class Doc: def __set__(self, ents): # TODO: - # 1. Allow negative matches - # 2. Ensure pre-set NERs are not over-written during statistical - # prediction - # 3. Test basic data-driven ORTH gazetteer - # 4. Test more nuanced date and currency regex + # 1. Test basic data-driven ORTH gazetteer + # 2. Test more nuanced date and currency regex tokens_in_ents = {} cdef attr_t entity_type + cdef attr_t kb_id cdef int ent_start, ent_end for ent_info in ents: entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info) @@ -545,27 +543,31 @@ cdef class Doc: tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id) cdef int i for i in range(self.length): - self.c[i].ent_type = 0 - self.c[i].ent_kb_id = 0 - self.c[i].ent_iob = 0 # Means missing. 
-        cdef attr_t ent_type
-        cdef int start, end
-        for ent_info in ents:
-            ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
-            if ent_type is None or ent_type < 0:
-                # Mark as O
-                for i in range(start, end):
-                    self.c[i].ent_type = 0
-                    self.c[i].ent_kb_id = 0
-                    self.c[i].ent_iob = 2
-            else:
-                # Mark (inside) as I
-                for i in range(start, end):
-                    self.c[i].ent_type = ent_type
-                    self.c[i].ent_kb_id = ent_kb_id
-                    self.c[i].ent_iob = 1
-                # Set start as B
-                self.c[start].ent_iob = 3
+            # default values
+            entity_type = 0
+            kb_id = 0
+
+            # Set ent_iob to Missing (0) by default, unless this token was NER-annotated before
+            ent_iob = 0
+            if self.c[i].ent_iob != 0:
+                ent_iob = 2
+
+            # overwrite if the token was part of a specified entity
+            if i in tokens_in_ents.keys():
+                ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
+                if entity_type is None or entity_type <= 0:
+                    # Blocking this token from being overwritten by downstream NER
+                    ent_iob = 3
+                elif ent_start == i:
+                    # Marking the start of an entity
+                    ent_iob = 3
+                else:
+                    # Marking the inside of an entity
+                    ent_iob = 1
+
+            self.c[i].ent_type = entity_type
+            self.c[i].ent_kb_id = kb_id
+            self.c[i].ent_iob = ent_iob
 
    @property
    def noun_chunks(self):
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 07c6f1c99..69b9def38 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -749,7 +749,8 @@ cdef class Token:
    def ent_iob_(self):
        """IOB code of named entity tag. "B" means the token begins an entity,
        "I" means it is inside an entity, "O" means it is outside an entity,
-        and "" means no entity tag is set.
+        and "" means no entity tag is set. "B" with an empty ent_type
+        means that the token is blocked from further processing by NER.
 
        RETURNS (unicode): IOB code of named entity tag.
        """
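Taken together, the last patch makes it possible to preset `doc.ents` with a zero label so that a downstream statistical NER treats those tokens as blocked rather than merely unlabelled. A short usage sketch, adapted from the `test_block_ner` test in this series; the component name and example sentence are illustrative:

```python
from spacy.lang.en import English


class MyBlocker(object):
    """Pipeline component that blocks a fixed token range from NER."""

    name = "my_blocker"

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __call__(self, doc):
        # A (label, start, end) tuple with label 0 marks the span as blocked:
        # the tokens get ent_iob 3 ("B") with an empty ent_type, and the
        # statistical NER may not overwrite them
        doc.ents = [(0, self.start, self.end)]
        return doc


nlp = English()
nlp.add_pipe(MyBlocker(2, 5))  # block tokens 2, 3 and 4
ner = nlp.create_pipe("ner")
ner.add_label("MY_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()

doc = nlp("This is Antti L Korhonen speaking in Finland")
# Blocked tokens keep ent_iob_ "B" with an empty ent_type_; the rest are "O"
print([(t.text, t.ent_iob_, t.ent_type_) for t in doc])
```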