Merge branch 'develop' into spacy.io

Ines Montani 2019-03-04 16:30:27 +00:00
commit 21ad01259c
18 changed files with 144 additions and 48 deletions


@@ -4,13 +4,13 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "2.1.0a9.dev2"
__version__ = "2.1.0a10"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io"
__author__ = "Explosion AI"
__email__ = "contact@explosion.ai"
__license__ = "MIT"
__release__ = False
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"


@@ -456,12 +456,16 @@ cdef class GoldParse:
if deps is None:
deps = [None for _ in doc]
if entities is None:
entities = [None for _ in doc]
entities = ['-' for _ in doc]
elif len(entities) == 0:
entities = ['O' for _ in doc]
elif not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else '-') for ent in entities]
if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
self.mem = Pool()
self.loss = 0
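
For reference, a rough sketch of what the character-offset branch above amounts to, assuming a blank English pipeline for tokenization (the sentence and offsets are made up):

```python
from spacy.lang.en import English
from spacy.gold import biluo_tags_from_offsets

nlp = English()  # assumption: a blank English pipeline, used only for tokenization
doc = nlp("I like London")
# Entities given as (start_char, end_char, label) offsets are converted to
# per-token BILUO tags; tokens outside any entity get "O".
tags = biluo_tags_from_offsets(doc, [(7, 13, "GPE")])
assert tags == ["O", "O", "U-GPE"]
```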


@@ -74,12 +74,10 @@ def like_num(text):
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
print(suffix_filter(text))
if text.lower() in _num_words:
return True
elif suffix_filter(text) in _num_words:
return True
return False


@@ -15,7 +15,7 @@ class UkrainianLemmatizer(Lemmatizer):
UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
except (ImportError, TypeError):
raise ImportError(
"The Ukrainian lemmatizer requires the pymorphy2 library and
"The Ukrainian lemmatizer requires the pymorphy2 library and "
'dictionaries: try to fix it with "pip uninstall pymorphy2" and'
'"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
)


@@ -60,9 +60,10 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
for i in range(doc.length):
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, &predicate_cache[i],
transition_states(states, matches, predicate_cache,
doc[i], extra_attr_values, predicates)
extra_attr_values += nr_extra_attr
predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns
finish_states(matches, states)
output = []
@@ -105,7 +106,6 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
if states[i].pattern.nr_py >= 1:
update_predicate_cache(cached_py_predicates,
states[i].pattern, token, py_predicates)
for i in range(states.size()):
action = get_action(states[i], token.c, extra_attrs,
cached_py_predicates)
if action == REJECT:
@@ -127,6 +127,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
PatternStateC(pattern=state.pattern+1, start=state.start,
length=state.length+1))
states[q].pattern += 1
if states[q].pattern.nr_py != 0:
update_predicate_cache(cached_py_predicates,
states[q].pattern, token, py_predicates)


@@ -141,7 +141,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name is None:
if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)


@@ -192,3 +192,10 @@ def test_tokens_sent(doc):
assert doc[7].sent.text == "This is another sentence ."
assert doc[1].sent.root.left_edge.text == "This"
assert doc[7].sent.root.left_edge.text == "This"
def test_token0_has_sent_start_true():
doc = Doc(Vocab(), words=["hello", "world"])
assert doc[0].is_sent_start is True
assert doc[1].is_sent_start is None
assert not doc.is_sentenced


@@ -16,6 +16,10 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
@pytest.mark.parametrize("lang", LANGUAGES)
def test_lang_initialize(lang):
def test_lang_initialize(lang, capfd):
"""Test that languages can be initialized."""
lang_cls = get_lang_class(lang)() # noqa: F841
nlp = get_lang_class(lang)() # noqa: F841
# Check for stray print statements (see #3342)
doc = nlp("test")
captured = capfd.readouterr()
assert not captured.out


@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
import re
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.tokens import Doc, Span
pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}]
@@ -129,3 +129,17 @@ def test_matcher_end_zero_plus(en_vocab):
assert len(matcher(nlp("a b c"))) == 2
assert len(matcher(nlp("a b b c"))) == 3
assert len(matcher(nlp("a b b"))) == 3
def test_matcher_sets_return_correct_tokens(en_vocab):
matcher = Matcher(en_vocab)
patterns = [
[{'LOWER': {'IN': ["zero"]}}],
[{'LOWER': {'IN': ["one"]}}],
[{'LOWER': {'IN': ["two"]}}],
]
matcher.add('TEST', None, *patterns)
doc = Doc(en_vocab, words="zero one two three".split())
matches = matcher(doc)
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
assert texts == ['zero', 'one', 'two']


@@ -1,12 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.matcher import Matcher
from spacy.tokens import Doc
@pytest.mark.xfail
def test_issue3328(en_vocab):
doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
matcher = Matcher(en_vocab)


@@ -0,0 +1,21 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
@pytest.mark.xfail
def test_issue3331(en_vocab):
"""Test that duplicate patterns for different rules result in multiple
matches, one per rule.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
matches = matcher(doc)
assert len(matches) == 2
match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
assert sorted(match_ids) == ["A", "B"]


@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, GoldParse
from spacy.tokens import Doc
@@ -62,3 +62,9 @@ def test_biluo_spans(en_tokenizer):
assert spans[0].label_ == "LOC"
assert spans[1].text == "London"
assert spans[1].label_ == "GPE"
def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
gold = GoldParse(doc, entities=biluo_tags) # noqa: F841


@@ -188,13 +188,18 @@ cdef class Doc:
@property
def is_sentenced(self):
# Check if the document has sentence boundaries,
# i.e at least one tok has the sent_start in (-1, 1)
"""Check if the document has sentence boundaries assigned. This is
defined as having at least one of the following:
a) An entry "sents" in doc.user_hooks;
b) sent.is_parsed is set to True;
c) At least one token other than the first where sent_start is not None.
"""
if 'sents' in self.user_hooks:
return True
if self.is_parsed:
return True
for i in range(self.length):
for i in range(1, self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True
else:
@@ -569,6 +574,9 @@ cdef class Doc:
raise ValueError(Errors.E031.format(i=self.length))
t.spacy = has_space
self.length += 1
if self.length == 1:
# Set token.sent_start to 1 for first token. See issue #2869
self.c[0].sent_start = 1
return t.idx + t.lex.length + t.spacy
@cython.boundscheck(False)
@@ -1000,8 +1008,10 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
tokens[i].r_kids = 0
tokens[i].l_edge = i
tokens[i].r_edge = i
# Twice, for non-projectivity
for loop_count in range(2):
# Three times, for non-projectivity
# See issue #3170. This isn't a very satisfying fix, but I think it's
# sufficient.
for loop_count in range(3):
# Set left edges
for i in range(length):
child = &tokens[i]


@@ -262,19 +262,49 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width]
[--depth] [--embed-rows] [--dropout] [--seed] [--n-iter] [--use-vectors]
```
| Argument | Type | Description |
| ---------------------- | ---------- | --------------------------------------------------------------------- |
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from. |
| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. |
| `output_dir` | positional | Directory to write models to on each epoch. |
| `--width`, `-cw` | option | Width of CNN layers. |
| `--depth`, `-cd` | option | Depth of CNN layers. |
| `--embed-rows`, `-er` | option | Number of embedding rows. |
| `--dropout`, `-d` | option | Dropout rate. |
| `--seed`, `-s` | option | Seed for random number generators. |
| `--n-iter`, `-i` | option | Number of iterations to pretrain. |
| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. |
| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
| Argument | Type | Description |
| ---------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------- |
| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"`. [See here](#pretrain-jsonl) for details. |
| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. |
| `output_dir` | positional | Directory to write models to on each epoch. |
| `--width`, `-cw` | option | Width of CNN layers. |
| `--depth`, `-cd` | option | Depth of CNN layers. |
| `--embed-rows`, `-er` | option | Number of embedding rows. |
| `--dropout`, `-d` | option | Dropout rate. |
| `--seed`, `-s` | option | Seed for random number generators. |
| `--n-iter`, `-i` | option | Number of iterations to pretrain. |
| `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. |
| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
### JSONL format for raw text {#pretrain-jsonl}
Raw text can be provided as a `.jsonl` (newline-delimited JSON) file containing
one input text per line (roughly paragraph length is good). Optionally, custom
tokenization can be provided.
> #### Tip: Writing JSONL
>
> Our utility library [`srsly`](https://github.com/explosion/srsly) provides a
> handy `write_jsonl` helper that takes a file path and list of dictionaries and
> writes out JSONL-formatted data.
>
> ```python
> import srsly
> data = [{"text": "Some text"}, {"text": "More..."}]
> srsly.write_jsonl("/path/to/text.jsonl", data)
> ```
| Key | Type | Description |
| -------- | ------- | -------------------------------------------- |
| `text` | unicode | The raw input text. |
| `tokens` | list | Optional tokenization, one string per token. |
```json
### Example
{"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
{"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
{"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
```
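
If you want to supply your own tokenization, one way (sketched here with a made-up record, reusing the `srsly` helper from the tip above) is to add the optional `tokens` key as a list of strings alongside `"text"`:

```python
import srsly

# Made-up example record: "tokens" provides custom tokenization, one string
# per token, matching the raw "text".
data = [
    {
        "text": "Can I ask where you work now?",
        "tokens": ["Can", "I", "ask", "where", "you", "work", "now", "?"],
    }
]
srsly.write_jsonl("/path/to/text.jsonl", data)
```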
## Init Model {#init-model new="2"}


@@ -14,16 +14,16 @@ examples of a label to have the value `1.0`, and negative examples of a label to
have the value `0.0`. Labels not in the dictionary are treated as missing; the
gradient for those labels will be zero.
| Name | Type | Description |
| ----------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The document the annotations refer to. |
| `words` | iterable | A sequence of unicode word strings. |
| `tags` | iterable | A sequence of strings, representing tag annotations. |
| `heads` | iterable | A sequence of integers, representing syntactic head offsets. |
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. |
| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
| **RETURNS** | `GoldParse` | The newly constructed object. |
| Name | Type | Description |
| ----------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The document the annotations refer to. |
| `words` | iterable | A sequence of unicode word strings. |
| `tags` | iterable | A sequence of strings, representing tag annotations. |
| `heads` | iterable | A sequence of integers, representing syntactic head offsets. |
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
| **RETURNS** | `GoldParse` | The newly constructed object. |
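
As a rough sketch of how the `entities` and `cats` arguments described above fit together, assuming a blank English pipeline for tokenization (the labels are made up):

```python
from spacy.lang.en import English
from spacy.gold import GoldParse

nlp = English()  # assumption: blank pipeline, used only to create the Doc
doc = nlp("I flew to Silicon Valley via London")
# BILUO tags; None marks a token whose entity annotation is missing
entities = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE"]
# Text classification labels: 1.0 = positive example, 0.0 = negative example.
# Labels not listed here are treated as missing.
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
gold = GoldParse(doc, entities=entities, cats=cats)
```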
## GoldParse.\_\_len\_\_ {#len tag="method"}


@@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants.
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
A boolean value indicating whether the token starts a sentence. `None` if
unknown.
unknown. Defaults to `True` for the first token in the `doc`.
> #### Example
>


@@ -249,6 +249,9 @@ if all of your models are up to date, you can run the
+ sentence_splitter = nlp.create_pipe("sentencizer")
```
- The `is_sent_start` attribute of the first token in a `Doc` now correctly
defaults to `True`. It previously defaulted to `None`.
- The keyword argument `n_threads` on the `.pipe` methods is now deprecated, as
the v2.x models cannot release the global interpreter lock. (Future versions
may introduce a `n_process` argument for parallel inference via


@@ -11,11 +11,11 @@ export default function HTML(props) {
name="viewport"
content="width=device-width, initial-scale=1, shrink-to-fit=no"
/>
{props.headComponents}
<link
rel="stylesheet"
href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css"
/>
{props.headComponents}
</head>
<body {...props.bodyAttributes}>
{props.preBodyComponents}