Merge branch 'master' into spacy.io

2025-11-01 00:17:44 +03:00 · 2019-09-17 14:53:23 +02:00 · 2019-09-17 14:53:23 +02:00 · 5aab805c15
commit 5aab805c15
parent 237a62c5d5 25c2b4b9a5
6 changed files with 182 additions and 147 deletions
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -130,10 +130,6 @@ cdef class Parser:
    def __reduce__(self):
        return (Parser, (self.vocab, self.moves, self.model), None, None)
    @property
    def tok2vec(self):
        return self.model.tok2vec
    @property
    def move_names(self):
        names = []
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -6,6 +6,7 @@ import pytest
 from spacy.kb import KnowledgeBase
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
@pytest.fixture
@ -171,3 +172,31 @@ def test_preserving_links_asdoc(nlp):
        for s_ent in sent_doc.ents:
            if s_ent.text == orig_text:
                assert s_ent.kb_id_ == orig_kb_id
 def test_preserving_links_ents(nlp):
    """Check that assigning a Span carrying a kb_id to doc.ents keeps both label and KB ID."""
    doc = nlp("She lives in Boston. He lives in Denver.")
    # The pipeline is expected to produce no entities before we set one.
    assert len(list(doc.ents)) == 0
    doc.ents = [Span(doc, 3, 4, label="LOC", kb_id="Q1")]
    stored_ents = list(doc.ents)
    assert len(stored_ents) == 1
    first_ent = stored_ents[0]
    assert first_ent.label_ == "LOC"
    assert first_ent.kb_id_ == "Q1"
 def test_preserving_links_ents_2(nlp):
    """Check that doc.ents keeps KB annotations when set from a (label, kb_id, start, end) tuple."""
    doc = nlp("She lives in Boston. He lives in Denver.")
    # The pipeline is expected to produce no entities before we set one.
    assert len(list(doc.ents)) == 0
    string_store = doc.vocab.strings
    # Register the label first, then the KB identifier, and reuse the returned keys.
    loc_key = string_store.add("LOC")
    q1_key = string_store.add("Q1")
    doc.ents = [(loc_key, q1_key, 3, 4)]
    stored_ents = list(doc.ents)
    assert len(stored_ents) == 1
    first_ent = stored_ents[0]
    assert first_ent.label_ == "LOC"
    assert first_ent.kb_id_ == "Q1"
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -146,6 +146,7 @@ def _merge(Doc doc, merges):
        syntactic root of the span.
    RETURNS (Token): The first newly merged token.
    """
    cdef int i, merge_index, start, end, token_index
    cdef Span span
    cdef const LexemeC* lex
    cdef TokenC* token
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -534,7 +534,7 @@ cdef class Doc:
            cdef attr_t entity_type
            cdef int ent_start, ent_end
            for ent_info in ents:
-                entity_type, ent_start, ent_end = get_entity_info(ent_info)
+                entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
                for token_index in range(ent_start, ent_end):
                    if token_index in tokens_in_ents.keys():
                        raise ValueError(Errors.E103.format(
@ -542,7 +542,7 @@ cdef class Doc:
                                   tokens_in_ents[token_index][1],
                                   self.vocab.strings[tokens_in_ents[token_index][2]]),
                            span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
-                    tokens_in_ents[token_index] = (ent_start, ent_end, entity_type)
+                    tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
            cdef int i
            for i in range(self.length):
                self.c[i].ent_type = 0
@ -551,16 +551,18 @@ cdef class Doc:
            cdef attr_t ent_type
            cdef int start, end
            for ent_info in ents:
-                ent_type, start, end = get_entity_info(ent_info)
+                ent_type, ent_kb_id, start, end = get_entity_info(ent_info)
                if ent_type is None or ent_type < 0:
                    # Mark as O
                    for i in range(start, end):
                        self.c[i].ent_type = 0
                        self.c[i].ent_kb_id = 0
                        self.c[i].ent_iob = 2
                else:
                    # Mark (inside) as I
                    for i in range(start, end):
                        self.c[i].ent_type = ent_type
                        self.c[i].ent_kb_id = ent_kb_id
                        self.c[i].ent_iob = 1
                    # Set start as B
                    self.c[start].ent_iob = 3
@ -1251,10 +1253,14 @@ def fix_attributes(doc, attributes):
 def get_entity_info(ent_info):
    if isinstance(ent_info, Span):
        ent_type = ent_info.label
        ent_kb_id = ent_info.kb_id
        start = ent_info.start
        end = ent_info.end
    elif len(ent_info) == 3:
        ent_type, start, end = ent_info
        ent_kb_id = 0
    elif len(ent_info) == 4:
        ent_type, ent_kb_id, start, end = ent_info
    else:
-        ent_id, ent_type, start, end = ent_info
+        ent_id, ent_kb_id, ent_type, start, end = ent_info
-    return ent_type, start, end
+    return ent_type, ent_kb_id, start, end
--- a/website/docs/api/annotation.md
+++ b/website/docs/api/annotation.md
@ -187,7 +187,7 @@ annotation scheme. We also map the tags to the simpler Google Universal POS tag
 set.
 | Tag       |  POS    | Morphology                               | Description                                       |
-| --------- | ------- | ------------------------------------------- | ------------------------------------------------- |
+| --------- | ------- | ---------------------------------------- | ------------------------------------------------- |
 | `$(`      | `PUNCT` | `PunctType=brck`                         | other sentence-internal punctuation mark          |
 | `$,`      | `PUNCT` | `PunctType=comm`                         | comma                                             |
 | `$.`      | `PUNCT` | `PunctType=peri`                         | sentence-final punctuation mark                   |
@ -380,7 +380,7 @@ The German dependency labels use the
 annotation scheme.
 | Label   | Description                     |
-| ------ | ------------------------------- |
+| ------- | ------------------------------- |
 | `ac`    | adpositional case marker        |
 | `adc`   | adjective component             |
 | `ag`    | genitive attribute              |
@ -584,8 +584,8 @@ data.
 ```python
 ### Entry structure
 {
-    "orth": string,
+    "orth": string,     # the word text
-    "id": int,
+    "id": int,          # can correspond to row in vectors table
    "lower": string,
    "norm": string,
    "shape": string
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -175,7 +175,7 @@ All output files generated by this command are compatible with
 <!-- TODO: document jsonl option – maybe update it? -->
 | ID                             | Description                                                                                                                                                                                                                                                                                                                                                                                    |
-| ------------------------------ | --------------------------------------------------------------- |
+| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `auto`                         | Automatically pick converter based on file extension and file content (default).                                                                                                                                                                                                                                                                                                               |
 | `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format.                                                                                                                                                                                                                                                                                                                                           |
 | `ner`                          | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
@ -292,7 +292,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 ```
 | Argument                                              | Type       | Description                                                                                                                                                                     |
-| ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `texts_loc`                                           | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](#pretrain-jsonl) for details.               |
 | `vectors_model`                                       | positional | Name or path to spaCy model with vectors to learn from.                                                                                                                         |
 | `output_dir`                                          | positional | Directory to write models to on each epoch.                                                                                                                                     |
@ -308,8 +308,8 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `--n-iter`, `-i`                                      | option     | Number of iterations to pretrain.                                                                                                                                               |
 | `--use-vectors`, `-uv`                                | flag       | Whether to use the static vectors as input features.                                                                                                                            |
 | `--n-save-every`, `-se`                               | option     | Save model every X batches.                                                                                                                                                     |
-| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.|
+| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option     | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.                                                                     |
-| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.|
+| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option     | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
 | **CREATES**                                           | weights    | The pre-trained weights that can be used to initialize `spacy train`.                                                                                                           |
 ### JSONL format for raw text {#pretrain-jsonl}
@ -331,7 +331,7 @@ tokenization can be provided.
 > ```
 | Key      | Type    | Description                                                |
-| -------- | ------- | -------------------------------------------- |
+| -------- | ------- | ---------------------------------------------------------- |
 | `text`   | unicode | The raw input text. Is not required if `tokens` available. |
 | `tokens` | list    | Optional tokenization, one string per token.               |
@ -347,14 +347,17 @@ tokenization can be provided.
 Create a new model directory from raw data, like word frequencies, Brown
 clusters and word vectors. This command is similar to the `spacy model` command
-in v1.x.
+in v1.x. Note that in order to populate the model's vocab, you need to pass in a
 JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) as
 `--jsonl-loc` with optional `id` values that correspond to the vectors table.
 Just loading in vectors will not automatically populate the vocab.
 <Infobox title="Deprecation note" variant="warning">
 As of v2.1.0, the `--freqs-loc` and `--clusters-loc` are deprecated and have
 been replaced with the `--jsonl-loc` argument, which lets you pass in a
-[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
+[JSONL](http://jsonlines.org/) file containing one lexical entry per line. For
-lexical entry per line. For more details on the format, see the
+more details on the format, see the
 [annotation specs](/api/annotation#vocab-jsonl).
 </Infobox>
@ -368,7 +371,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lang`                  | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`.                                                                                                                                                                      |
 | `output_dir`            | positional | Model output directory. Will be created if it doesn't exist.                                                                                                                                                                                                      |
-| `--jsonl-loc`, `-j`     | option     | Optional location of JSONL-formatted vocabulary file with lexical attributes.                                                                                                                                                                                     |
+| `--jsonl-loc`, `-j`     | option     | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes.                                                                                                                                                      |
 | `--vectors-loc`, `-v`   | option     | Optional location of vectors file. Should be a tab-separated file in Word2Vec format where the first column contains the word and the remaining columns the values. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
 | `--prune-vectors`, `-V` | flag       | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                                                                                                                                                                                    |
 | **CREATES**             | model      | A spaCy model containing the vocab and vectors.                                                                                                                                                                                                                   |
@ -424,7 +427,7 @@ pip install dist/en_model-0.0.0.tar.gz
 | `input_dir`                                      | positional | Path to directory containing model data.                                                                                                                                                        |
 | `output_dir`                                     | positional | Directory to create package folder in.                                                                                                                                                          |
 | `--meta-path`, `-m` <Tag variant="new">2</Tag>   | option     | Path to `meta.json` file (optional).                                                                                                                                                            |
-| `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag       | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt.
+| `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag       | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. |
 | `--force`, `-f`                                  | flag       | Force overwriting of existing folder in output directory.                                                                                                                                       |
 | `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                                                                                                                      |
 | **CREATES**                                      | directory  | A Python package containing the spaCy model.                                                                                                                                                    |