diff --git a/spacy/about.py b/spacy/about.py index 4390529fa..a4ee29189 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,13 +4,13 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "2.1.0a9.dev2" +__version__ = "2.1.0a10" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" __email__ = "contact@explosion.ai" __license__ = "MIT" -__release__ = False +__release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 91204f671..756e6a5fa 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -456,12 +456,16 @@ cdef class GoldParse: if deps is None: deps = [None for _ in doc] if entities is None: - entities = [None for _ in doc] + entities = ['-' for _ in doc] elif len(entities) == 0: entities = ['O' for _ in doc] - elif not isinstance(entities[0], basestring): - # Assume we have entities specified by character offset. - entities = biluo_tags_from_offsets(doc, entities) + else: + # Translate the None values to '-', to make processing easier. + # See Issue #2603 + entities = [(ent if ent is not None else '-') for ent in entities] + if not isinstance(entities[0], basestring): + # Assume we have entities specified by character offset. 
+ entities = biluo_tags_from_offsets(doc, entities) self.mem = Pool() self.loss = 0 diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py index 87def6162..40158ad7a 100644 --- a/spacy/lang/ta/lex_attrs.py +++ b/spacy/lang/ta/lex_attrs.py @@ -74,12 +74,10 @@ def like_num(text): num, denom = text.split("/") if num.isdigit() and denom.isdigit(): return True - print(suffix_filter(text)) if text.lower() in _num_words: return True elif suffix_filter(text) in _num_words: return True - return False diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index f41f36fe2..ab56c824d 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -15,7 +15,7 @@ class UkrainianLemmatizer(Lemmatizer): UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") except (ImportError, TypeError): raise ImportError( - "The Ukrainian lemmatizer requires the pymorphy2 library and + "The Ukrainian lemmatizer requires the pymorphy2 library and " 'dictionaries: try to fix it with "pip uninstall pymorphy2" and' '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' ) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 5f17ec867..17b0a4a36 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -60,9 +60,10 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, for i in range(doc.length): for j in range(n): states.push_back(PatternStateC(patterns[j], i, 0)) - transition_states(states, matches, &predicate_cache[i], + transition_states(states, matches, predicate_cache, doc[i], extra_attr_values, predicates) extra_attr_values += nr_extra_attr + predicate_cache += len(predicates) # Handle matches that end in 0-width patterns finish_states(matches, states) output = [] @@ -105,7 +106,6 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[i].pattern.nr_py >= 1: update_predicate_cache(cached_py_predicates, states[i].pattern, token, 
py_predicates) - for i in range(states.size()): action = get_action(states[i], token.c, extra_attrs, cached_py_predicates) if action == REJECT: @@ -127,6 +127,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match PatternStateC(pattern=state.pattern+1, start=state.start, length=state.length+1)) states[q].pattern += 1 + if states[q].pattern.nr_py != 0: update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 7a1d34c00..b43a879d4 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -141,7 +141,7 @@ cdef class BiluoPushDown(TransitionSystem): cdef Transition lookup_transition(self, object name) except *: cdef attr_t label - if name == '-' or name is None: + if name == '-' or name == '' or name is None: return Transition(clas=0, move=MISSING, label=0, score=0) elif name == '!O': return Transition(clas=0, move=ISNT, label=0, score=0) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 5713c5c07..3dd9935b2 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -192,3 +192,10 @@ def test_tokens_sent(doc): assert doc[7].sent.text == "This is another sentence ." 
assert doc[1].sent.root.left_edge.text == "This" assert doc[7].sent.root.left_edge.text == "This" + + +def test_token0_has_sent_start_true(): + doc = Doc(Vocab(), words=["hello", "world"]) + assert doc[0].is_sent_start is True + assert doc[1].is_sent_start is None + assert not doc.is_sentenced diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 7b303397e..4a01ba50a 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -16,6 +16,10 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", @pytest.mark.parametrize("lang", LANGUAGES) -def test_lang_initialize(lang): +def test_lang_initialize(lang, capfd): """Test that languages can be initialized.""" - lang_cls = get_lang_class(lang)() # noqa: F841 + nlp = get_lang_class(lang)() # noqa: F841 + # Check for stray print statements (see #3342) + doc = nlp("test") + captured = capfd.readouterr() + assert not captured.out diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 56a03d200..f103dda8a 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest import re from spacy.matcher import Matcher -from spacy.tokens import Doc +from spacy.tokens import Doc, Span pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}] @@ -129,3 +129,17 @@ def test_matcher_end_zero_plus(en_vocab): assert len(matcher(nlp("a b c"))) == 2 assert len(matcher(nlp("a b b c"))) == 3 assert len(matcher(nlp("a b b"))) == 3 + + +def test_matcher_sets_return_correct_tokens(en_vocab): + matcher = Matcher(en_vocab) + patterns = [ + [{'LOWER': {'IN': ["zero"]}}], + [{'LOWER': {'IN': ["one"]}}], + [{'LOWER': {'IN': ["two"]}}], + ] + matcher.add('TEST', None, *patterns) + doc = Doc(en_vocab, words="zero one two three".split()) + matches = matcher(doc) + texts = [Span(doc, s, e, 
label=L).text for L, s, e in matches] + assert texts == ['zero', 'one', 'two'] diff --git a/spacy/tests/regression/test_issue3328.py b/spacy/tests/regression/test_issue3328.py index fce25ca1c..c397feebb 100644 --- a/spacy/tests/regression/test_issue3328.py +++ b/spacy/tests/regression/test_issue3328.py @@ -1,12 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import pytest from spacy.matcher import Matcher from spacy.tokens import Doc -@pytest.mark.xfail def test_issue3328(en_vocab): doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) matcher = Matcher(en_vocab) diff --git a/spacy/tests/regression/test_issue3331.py b/spacy/tests/regression/test_issue3331.py new file mode 100644 index 000000000..c30712f81 --- /dev/null +++ b/spacy/tests/regression/test_issue3331.py @@ -0,0 +1,21 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc + + +@pytest.mark.xfail +def test_issue3331(en_vocab): + """Test that duplicate patterns for different rules result in multiple + matches, one per rule. 
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) + matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) + doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) + matches = matcher(doc) + assert len(matches) == 2 + match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] + assert sorted(match_ids) == ["A", "B"] diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 30dd2e6c6..d370eac53 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags +from spacy.gold import spans_from_biluo_tags, GoldParse from spacy.tokens import Doc @@ -62,3 +62,9 @@ def test_biluo_spans(en_tokenizer): assert spans[0].label_ == "LOC" assert spans[1].text == "London" assert spans[1].label_ == "GPE" + + +def test_gold_ner_missing_tags(en_tokenizer): + doc = en_tokenizer("I flew to Silicon Valley via London.") + biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] + gold = GoldParse(doc, entities=biluo_tags) # noqa: F841 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ba0801e34..97ac10f76 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -188,13 +188,18 @@ cdef class Doc: @property def is_sentenced(self): - # Check if the document has sentence boundaries, - # i.e at least one tok has the sent_start in (-1, 1) + """Check if the document has sentence boundaries assigned. This is + defined as having at least one of the following: + + a) An entry "sents" in doc.user_hooks; + b) doc.is_parsed is set to True; + c) At least one token other than the first where sent_start is not None. 
+ """ if 'sents' in self.user_hooks: return True if self.is_parsed: return True - for i in range(self.length): + for i in range(1, self.length): if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: return True else: @@ -569,6 +574,9 @@ cdef class Doc: raise ValueError(Errors.E031.format(i=self.length)) t.spacy = has_space self.length += 1 + if self.length == 1: + # Set token.sent_start to 1 for first token. See issue #2869 + self.c[0].sent_start = 1 return t.idx + t.lex.length + t.spacy @cython.boundscheck(False) @@ -1000,8 +1008,10 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: tokens[i].r_kids = 0 tokens[i].l_edge = i tokens[i].r_edge = i - # Twice, for non-projectivity - for loop_count in range(2): + # Three times, for non-projectivity + # See issue #3170. This isn't a very satisfying fix, but I think it's + # sufficient. + for loop_count in range(3): # Set left edges for i in range(length): child = &tokens[i] diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f30096fa1..2fc7a9e9a 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -262,19 +262,49 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width] [--depth] [--embed-rows] [--dropout] [--seed] [--n-iter] [--use-vectors] ``` -| Argument | Type | Description | -| ---------------------- | ---------- | --------------------------------------------------------------------- | -| `texts_loc` | positional | Path to JSONL file with raw texts to learn from. | -| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | -| `output_dir` | positional | Directory to write models to on each epoch. | -| `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | -| `--embed-rows`, `-er` | option | Number of embedding rows. | -| `--dropout`, `-d` | option | Dropout rate. | -| `--seed`, `-s` | option | Seed for random number generators. 
| -| `--n-iter`, `-i` | option | Number of iterations to pretrain. | -| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | -| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | +| Argument | Type | Description | +| ---------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------- | +| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"`. [See here](#pretrain-jsonl) for details. | +| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | +| `output_dir` | positional | Directory to write models to on each epoch. | +| `--width`, `-cw` | option | Width of CNN layers. | +| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--embed-rows`, `-er` | option | Number of embedding rows. | +| `--dropout`, `-d` | option | Dropout rate. | +| `--seed`, `-s` | option | Seed for random number generators. | +| `--n-iter`, `-i` | option | Number of iterations to pretrain. | +| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | +| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | + +### JSONL format for raw text {#pretrain-jsonl} + +Raw text can be provided as a `.jsonl` (newline-delimited JSON) file containing +one input text per line (roughly paragraph length is good). Optionally, custom +tokenization can be provided. + +> #### Tip: Writing JSONL +> +> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a +> handy `write_jsonl` helper that takes a file path and list of dictionaries and +> writes out JSONL-formatted data. 
+> +> ```python +> import srsly +> data = [{"text": "Some text"}, {"text": "More..."}] +> srsly.write_jsonl("/path/to/text.jsonl", data) +> ``` + +| Key | Type | Description | +| -------- | ------- | -------------------------------------------- | +| `text` | unicode | The raw input text. | +| `tokens` | list | Optional tokenization, one string per token. | + +```json +### Example +{"text": "Can I ask where you work now and what you do, and if you enjoy it?"} +{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} +{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."} +``` ## Init Model {#init-model new="2"} diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 1f71c5d58..ca5b6a811 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -14,16 +14,16 @@ examples of a label to have the value `1.0`, and negative examples of a label to have the value `0.0`. Labels not in the dictionary are treated as missing – the gradient for those labels will be zero. -| Name | Type | Description | -| ----------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document the annotations refer to. | -| `words` | iterable | A sequence of unicode word strings. | -| `tags` | iterable | A sequence of strings, representing tag annotations. | -| `heads` | iterable | A sequence of integers, representing syntactic head offsets. | -| `deps` | iterable | A sequence of strings, representing the syntactic relation types. 
| -| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. | -| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). | -| **RETURNS** | `GoldParse` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document the annotations refer to. | +| `words` | iterable | A sequence of unicode word strings. | +| `tags` | iterable | A sequence of strings, representing tag annotations. | +| `heads` | iterable | A sequence of integers, representing syntactic head offsets. | +| `deps` | iterable | A sequence of strings, representing the syntactic relation types. | +| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | +| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). | +| **RETURNS** | `GoldParse` | The newly constructed object. 
| ## GoldParse.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 5c22eaae3..1089d2329 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants. ## Token.is_sent_start {#is_sent_start tag="property" new="2"} A boolean value indicating whether the token starts a sentence. `None` if -unknown. +unknown. Defaults to `True` for the first token in the `doc`. > #### Example > diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index be3159a56..f97d9d283 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -249,6 +249,9 @@ if all of your models are up to date, you can run the + sentence_splitter = nlp.create_pipe("sentencizer") ``` +- The `is_sent_start` attribute of the first token in a `Doc` now correctly + defaults to `True`. It previously defaulted to `None`. + - The keyword argument `n_threads` on the `.pipe` methods is now deprecated, as the v2.x models cannot release the global interpreter lock. (Future versions may introduce a `n_process` argument for parallel inference via diff --git a/website/src/html.js b/website/src/html.js index 53e50bc8a..6094a2e55 100644 --- a/website/src/html.js +++ b/website/src/html.js @@ -11,11 +11,11 @@ export default function HTML(props) { name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" /> - {props.headComponents} + {props.headComponents} {props.preBodyComponents}