Mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' into spacy.io
Commit 21ad01259c
@@ -4,13 +4,13 @@
 # fmt: off

 __title__ = "spacy-nightly"
-__version__ = "2.1.0a9.dev2"
+__version__ = "2.1.0a10"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True

 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -456,12 +456,16 @@ cdef class GoldParse:
         if deps is None:
             deps = [None for _ in doc]
         if entities is None:
-            entities = [None for _ in doc]
-        elif not isinstance(entities[0], basestring):
-            # Assume we have entities specified by character offset.
-            entities = biluo_tags_from_offsets(doc, entities)
+            entities = ['-' for _ in doc]
+        elif len(entities) == 0:
+            entities = ['O' for _ in doc]
+        else:
+            # Translate the None values to '-', to make processing easier.
+            # See Issue #2603
+            entities = [(ent if ent is not None else '-') for ent in entities]
+            if not isinstance(entities[0], basestring):
+                # Assume we have entities specified by character offset.
+                entities = biluo_tags_from_offsets(doc, entities)

         self.mem = Pool()
         self.loss = 0

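In user terms, BILUO tag lists passed to `GoldParse` may now contain `None` for tokens whose annotation is unknown, and an empty or `None` entity list no longer breaks construction. A minimal sketch of the resulting usage (assuming the v2.1 dev API shown in this diff and a blank English pipeline):

```python
from spacy.lang.en import English
from spacy.gold import GoldParse

nlp = English()
doc = nlp("I flew to Silicon Valley via London.")
# None marks tokens whose NER annotation is missing; per the change above,
# they are translated to '-' internally and contribute no gradient.
tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
gold = GoldParse(doc, entities=tags)
```
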
@@ -74,12 +74,10 @@ def like_num(text):
         num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
-    print(suffix_filter(text))
     if text.lower() in _num_words:
         return True
     elif suffix_filter(text) in _num_words:
         return True

     return False

@@ -15,7 +15,7 @@ class UkrainianLemmatizer(Lemmatizer):
                 UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
         except (ImportError, TypeError):
             raise ImportError(
-                "The Ukrainian lemmatizer requires the pymorphy2 library and
+                "The Ukrainian lemmatizer requires the pymorphy2 library and "
                 'dictionaries: try to fix it with "pip uninstall pymorphy2" and'
                 '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
             )

@@ -60,9 +60,10 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
     for i in range(doc.length):
         for j in range(n):
             states.push_back(PatternStateC(patterns[j], i, 0))
-        transition_states(states, matches, &predicate_cache[i],
+        transition_states(states, matches, predicate_cache,
                           doc[i], extra_attr_values, predicates)
         extra_attr_values += nr_extra_attr
+        predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
     finish_states(matches, states)
     output = []

@@ -105,7 +106,6 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
         if states[i].pattern.nr_py >= 1:
             update_predicate_cache(cached_py_predicates,
                 states[i].pattern, token, py_predicates)
     for i in range(states.size()):
         action = get_action(states[i], token.c, extra_attrs,
             cached_py_predicates)
         if action == REJECT:

@@ -127,6 +127,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                 PatternStateC(pattern=state.pattern+1, start=state.start,
                               length=state.length+1))
             states[q].pattern += 1
             if states[q].pattern.nr_py != 0:
                 update_predicate_cache(cached_py_predicates,
                     states[q].pattern, token, py_predicates)

@@ -141,7 +141,7 @@ cdef class BiluoPushDown(TransitionSystem):
     cdef Transition lookup_transition(self, object name) except *:
         cdef attr_t label
-        if name == '-' or name is None:
+        if name == '-' or name == '' or name is None:
             return Transition(clas=0, move=MISSING, label=0, score=0)
         elif name == '!O':
             return Transition(clas=0, move=ISNT, label=0, score=0)

@@ -192,3 +192,10 @@ def test_tokens_sent(doc):
     assert doc[7].sent.text == "This is another sentence ."
     assert doc[1].sent.root.left_edge.text == "This"
     assert doc[7].sent.root.left_edge.text == "This"
+
+
+def test_token0_has_sent_start_true():
+    doc = Doc(Vocab(), words=["hello", "world"])
+    assert doc[0].is_sent_start is True
+    assert doc[1].is_sent_start is None
+    assert not doc.is_sentenced

@@ -16,6 +16,10 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",


 @pytest.mark.parametrize("lang", LANGUAGES)
-def test_lang_initialize(lang):
+def test_lang_initialize(lang, capfd):
     """Test that languages can be initialized."""
-    lang_cls = get_lang_class(lang)()  # noqa: F841
+    nlp = get_lang_class(lang)()  # noqa: F841
+    # Check for stray print statements (see #3342)
+    doc = nlp("test")
+    captured = capfd.readouterr()
+    assert not captured.out

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest
 import re
 from spacy.matcher import Matcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span


 pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}]

@@ -129,3 +129,17 @@ def test_matcher_end_zero_plus(en_vocab):
     assert len(matcher(nlp("a b c"))) == 2
     assert len(matcher(nlp("a b b c"))) == 3
     assert len(matcher(nlp("a b b"))) == 3
+
+
+def test_matcher_sets_return_correct_tokens(en_vocab):
+    matcher = Matcher(en_vocab)
+    patterns = [
+        [{'LOWER': {'IN': ["zero"]}}],
+        [{'LOWER': {'IN': ["one"]}}],
+        [{'LOWER': {'IN': ["two"]}}],
+    ]
+    matcher.add('TEST', None, *patterns)
+    doc = Doc(en_vocab, words="zero one two three".split())
+    matches = matcher(doc)
+    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
+    assert texts == ['zero', 'one', 'two']

@@ -1,12 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import pytest
 from spacy.matcher import Matcher
 from spacy.tokens import Doc


-@pytest.mark.xfail
 def test_issue3328(en_vocab):
     doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
     matcher = Matcher(en_vocab)

spacy/tests/regression/test_issue3331.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.matcher import PhraseMatcher
+from spacy.tokens import Doc
+
+
+@pytest.mark.xfail
+def test_issue3331(en_vocab):
+    """Test that duplicate patterns for different rules result in multiple
+    matches, one per rule.
+    """
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
+    matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
+    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
+    matches = matcher(doc)
+    assert len(matches) == 2
+    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
+    assert sorted(match_ids) == ["A", "B"]

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
-from spacy.gold import spans_from_biluo_tags
+from spacy.gold import spans_from_biluo_tags, GoldParse
 from spacy.tokens import Doc

@@ -62,3 +62,9 @@ def test_biluo_spans(en_tokenizer):
     assert spans[0].label_ == "LOC"
     assert spans[1].text == "London"
     assert spans[1].label_ == "GPE"
+
+
+def test_gold_ner_missing_tags(en_tokenizer):
+    doc = en_tokenizer("I flew to Silicon Valley via London.")
+    biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    gold = GoldParse(doc, entities=biluo_tags)  # noqa: F841

@@ -188,13 +188,18 @@ cdef class Doc:

     @property
     def is_sentenced(self):
-        # Check if the document has sentence boundaries,
-        # i.e at least one tok has the sent_start in (-1, 1)
+        """Check if the document has sentence boundaries assigned. This is
+        defined as having at least one of the following:
+
+        a) An entry "sents" in doc.user_hooks;
+        b) sent.is_parsed is set to True;
+        c) At least one token other than the first where sent_start is not None.
+        """
         if 'sents' in self.user_hooks:
             return True
         if self.is_parsed:
             return True
-        for i in range(self.length):
+        for i in range(1, self.length):
             if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                 return True
         else:

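As a rough illustration of the documented contract (a sketch against the behaviour introduced in this diff, not part of the patch itself): a fresh two-token `Doc` has `sent_start` set only on its first token, so it does not count as sentenced until some later token gets an explicit boundary.

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["hello", "world"])
assert doc[0].is_sent_start is True   # set automatically for token 0 (see the next hunk)
assert not doc.is_sentenced           # token 0 alone no longer counts
doc[1].is_sent_start = True           # mark an explicit boundary
assert doc.is_sentenced
```
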
@@ -569,6 +574,9 @@ cdef class Doc:
             raise ValueError(Errors.E031.format(i=self.length))
         t.spacy = has_space
         self.length += 1
+        if self.length == 1:
+            # Set token.sent_start to 1 for first token. See issue #2869
+            self.c[0].sent_start = 1
         return t.idx + t.lex.length + t.spacy

     @cython.boundscheck(False)

@@ -1000,8 +1008,10 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
         tokens[i].r_kids = 0
         tokens[i].l_edge = i
         tokens[i].r_edge = i
-    # Twice, for non-projectivity
-    for loop_count in range(2):
+    # Three times, for non-projectivity
+    # See issue #3170. This isn't a very satisfying fix, but I think it's
+    # sufficient.
+    for loop_count in range(3):
         # Set left edges
         for i in range(length):
             child = &tokens[i]

@@ -262,19 +262,49 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width]
 [--depth] [--embed-rows] [--dropout] [--seed] [--n-iter] [--use-vectors]
 ```

-| Argument               | Type       | Description                                                            |
-| ---------------------- | ---------- | ---------------------------------------------------------------------- |
-| `texts_loc`            | positional | Path to JSONL file with raw texts to learn from.                        |
-| `vectors_model`        | positional | Name or path to spaCy model with vectors to learn from.                |
-| `output_dir`           | positional | Directory to write models to on each epoch.                            |
-| `--width`, `-cw`       | option     | Width of CNN layers.                                                   |
-| `--depth`, `-cd`       | option     | Depth of CNN layers.                                                   |
-| `--embed-rows`, `-er`  | option     | Number of embedding rows.                                              |
-| `--dropout`, `-d`      | option     | Dropout rate.                                                          |
-| `--seed`, `-s`         | option     | Seed for random number generators.                                     |
-| `--n-iter`, `-i`       | option     | Number of iterations to pretrain.                                      |
-| `--use-vectors`, `-uv` | flag       | Whether to use the static vectors as input features.                   |
-| **CREATES**            | weights    | The pre-trained weights that can be used to initialize `spacy train`.  |
+| Argument               | Type       | Description                                                                                                                        |
+| ---------------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| `texts_loc`            | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"`. [See here](#pretrain-jsonl) for details. |
+| `vectors_model`        | positional | Name or path to spaCy model with vectors to learn from.                                                                           |
+| `output_dir`           | positional | Directory to write models to on each epoch.                                                                                       |
+| `--width`, `-cw`       | option     | Width of CNN layers.                                                                                                              |
+| `--depth`, `-cd`       | option     | Depth of CNN layers.                                                                                                              |
+| `--embed-rows`, `-er`  | option     | Number of embedding rows.                                                                                                         |
+| `--dropout`, `-d`      | option     | Dropout rate.                                                                                                                     |
+| `--seed`, `-s`         | option     | Seed for random number generators.                                                                                                |
+| `--n-iter`, `-i`       | option     | Number of iterations to pretrain.                                                                                                 |
+| `--use-vectors`, `-uv` | flag       | Whether to use the static vectors as input features.                                                                              |
+| **CREATES**            | weights    | The pre-trained weights that can be used to initialize `spacy train`.                                                             |
+
+### JSONL format for raw text {#pretrain-jsonl}
+
+Raw text can be provided as a `.jsonl` (newline-delimited JSON) file containing
+one input text per line (roughly paragraph length is good). Optionally, custom
+tokenization can be provided.
+
+> #### Tip: Writing JSONL
+>
+> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
+> handy `write_jsonl` helper that takes a file path and list of dictionaries and
+> writes out JSONL-formatted data.
+>
+> ```python
+> import srsly
+> data = [{"text": "Some text"}, {"text": "More..."}]
+> srsly.write_jsonl("/path/to/text.jsonl", data)
+> ```
+
+| Key      | Type    | Description                                  |
+| -------- | ------- | -------------------------------------------- |
+| `text`   | unicode | The raw input text.                          |
+| `tokens` | list    | Optional tokenization, one string per token. |
+
+```json
+### Example
+{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
+{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
+{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
+```
+
 ## Init Model {#init-model new="2"}

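The `tokens` key from the table above has no example of its own in this section; a hedged sketch of what such a record could look like, written with the `srsly.write_jsonl` helper mentioned in the tip (the file path and example sentences are made up for illustration):

```python
import srsly

data = [
    # Minimal record: only the required "text" key.
    {"text": "Can I ask where you work now and what you do?"},
    # Record with optional pre-tokenization: one string per token.
    {
        "text": "They may pull out of the Seattle market.",
        "tokens": ["They", "may", "pull", "out", "of", "the", "Seattle", "market", "."],
    },
]
srsly.write_jsonl("/path/to/text.jsonl", data)
```
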
@@ -14,16 +14,16 @@ examples of a label to have the value `1.0`, and negative examples of a label to
 have the value `0.0`. Labels not in the dictionary are treated as missing – the
 gradient for those labels will be zero.

-| Name        | Type        | Description                                                                                                                                            |
-| ----------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `doc`       | `Doc`       | The document the annotations refer to.                                                                                                                 |
-| `words`     | iterable    | A sequence of unicode word strings.                                                                                                                    |
-| `tags`      | iterable    | A sequence of strings, representing tag annotations.                                                                                                   |
-| `heads`     | iterable    | A sequence of integers, representing syntactic head offsets.                                                                                           |
-| `deps`      | iterable    | A sequence of strings, representing the syntactic relation types.                                                                                      |
-| `entities`  | iterable    | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. |
-| `cats`      | dict        | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
-| **RETURNS** | `GoldParse` | The newly constructed object.                                                                                                                          |
+| Name        | Type        | Description                                                                                                                                                                                                                              |
+| ----------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `doc`       | `Doc`       | The document the annotations refer to.                                                                                                                                                                                                   |
+| `words`     | iterable    | A sequence of unicode word strings.                                                                                                                                                                                                      |
+| `tags`      | iterable    | A sequence of strings, representing tag annotations.                                                                                                                                                                                     |
+| `heads`     | iterable    | A sequence of integers, representing syntactic head offsets.                                                                                                                                                                             |
+| `deps`      | iterable    | A sequence of strings, representing the syntactic relation types.                                                                                                                                                                        |
+| `entities`  | iterable    | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
+| `cats`      | dict        | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence).               |
+| **RETURNS** | `GoldParse` | The newly constructed object.                                                                                                                                                                                                            |

 ## GoldParse.\_\_len\_\_ {#len tag="method"}

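To make the two accepted `entities` formats concrete, a small illustrative sketch (not taken from the docs themselves; it assumes a blank English pipeline):

```python
from spacy.lang.en import English
from spacy.gold import GoldParse

nlp = English()
doc = nlp("London is big.")  # tokens: London, is, big, .

# 1) Character-offset tuples: (start_char, end_char, label)
gold_offsets = GoldParse(doc, entities=[(0, 6, "GPE")])

# 2) BILUO tag strings; None marks a missing value, as described above
gold_biluo = GoldParse(doc, entities=["U-GPE", "O", "O", None])
```
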
@@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants.
 ## Token.is_sent_start {#is_sent_start tag="property" new="2"}

 A boolean value indicating whether the token starts a sentence. `None` if
-unknown.
+unknown. Defaults to `True` for the first token in the `doc`.

 > #### Example
 >

@@ -249,6 +249,9 @@ if all of your models are up to date, you can run the
 + sentence_splitter = nlp.create_pipe("sentencizer")
 ```

+- The `is_sent_start` attribute of the first token in a `Doc` now correctly
+  defaults to `True`. It previously defaulted to `None`.
+
 - The keyword argument `n_threads` on the `.pipe` methods is now deprecated, as
   the v2.x models cannot release the global interpreter lock. (Future versions
   may introduce a `n_process` argument for parallel inference via

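A quick check of the new default described in the added bullet (a sketch, assuming a blank pipeline without a parser or sentencizer):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("hello world")
assert doc[0].is_sent_start is True   # returned None in v2.0
assert doc[1].is_sent_start is None
```
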
@@ -11,11 +11,11 @@ export default function HTML(props) {
             name="viewport"
             content="width=device-width, initial-scale=1, shrink-to-fit=no"
           />
-          {props.headComponents}
           <link
             rel="stylesheet"
             href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css"
          />
+          {props.headComponents}
         </head>
         <body {...props.bodyAttributes}>
           {props.preBodyComponents}