Merge branch 'develop' into spacy.io
Commit 17038fe768
@@ -214,9 +214,6 @@ def test_doc_retokenize_spans_entity_merge_iob():
         retokenizer.merge(doc[2:4])
         retokenizer.merge(doc[4:6])
         retokenizer.merge(doc[7:9])
-    for token in doc:
-        print(token)
-        print(token.ent_iob)
     assert len(doc) == 6
     assert doc[3].ent_iob_ == "B"
     assert doc[4].ent_iob_ == "I"
@@ -270,16 +267,16 @@ def test_doc_retokenize_merge_extension_attrs(en_vocab):
         attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
         retokenizer.merge(doc[0:2], attrs=attrs)
     assert doc[0].lemma_ == "hello world"
-    assert doc[0]._.a == True
+    assert doc[0]._.a is True
     assert doc[0]._.b == "1"
     # Test bulk merging
     doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
         retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
-    assert doc[0]._.a == True
+    assert doc[0]._.a is True
     assert doc[0]._.b == "1"
-    assert doc[1]._.a == None
+    assert doc[1]._.a is None
     assert doc[1]._.b == "2"


@@ -292,3 +289,29 @@ def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.merge(doc[0:2], attrs=attrs)
+
+
+def test_doc_retokenizer_merge_lex_attrs(en_vocab):
+    """Test that retokenization also sets attributes on the lexeme if they're
+    lexical attributes. For example, if a user sets IS_STOP, it should mean that
+    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
+    here is acceptable. Also see #2390.
+    """
+    # Test regular merging
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    assert not any(t.is_stop for t in doc)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
+    assert doc[0].lemma_ == "hello world"
+    assert doc[0].is_stop
+    # Test bulk merging
+    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
+    assert not any(t.like_num for t in doc)
+    assert not any(t.is_stop for t in doc)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2], attrs={"like_num": True})
+        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
+    assert doc[0].like_num
+    assert doc[1].is_stop
+    assert not doc[0].is_stop
+    assert not doc[1].like_num
@@ -137,10 +137,10 @@ def test_doc_retokenize_split_extension_attrs(en_vocab):
         attrs = {"lemma": ["los", "angeles"], "_": underscore}
         retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
     assert doc[0].lemma_ == "los"
-    assert doc[0]._.a == True
+    assert doc[0]._.a is True
     assert doc[0]._.b == "1"
     assert doc[1].lemma_ == "angeles"
-    assert doc[1]._.a == False
+    assert doc[1]._.a is False
     assert doc[1]._.b == "2"


@@ -165,3 +165,21 @@ def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
+
+
+def test_doc_retokenizer_split_lex_attrs(en_vocab):
+    """Test that retokenization also sets attributes on the lexeme if they're
+    lexical attributes. For example, if a user sets IS_STOP, it should mean that
+    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
+    here is acceptable. Also see #2390.
+    """
+    assert not Doc(en_vocab, words=["Los"])[0].is_stop
+    assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
+    doc = Doc(en_vocab, words=["LosAngeles", "start"])
+    assert not doc[0].is_stop
+    with doc.retokenize() as retokenizer:
+        attrs = {"is_stop": [True, False]}
+        heads = [(doc[0], 1), doc[1]]
+        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
+    assert doc[0].is_stop
+    assert not doc[1].is_stop
@@ -1,89 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import json
-from tempfile import NamedTemporaryFile
-
-from spacy.cli.train import train
-
-
-def test_cli_trained_model_can_be_saved(tmpdir):
-    lang = "nl"
-    output_dir = str(tmpdir)
-    train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False)
-    train_corpus = [
-        {
-            "id": "identifier_0",
-            "paragraphs": [
-                {
-                    "raw": "Jan houdt van Marie.\n",
-                    "sentences": [
-                        {
-                            "tokens": [
-                                {
-                                    "id": 0,
-                                    "dep": "nsubj",
-                                    "head": 1,
-                                    "tag": "NOUN",
-                                    "orth": "Jan",
-                                    "ner": "B-PER",
-                                },
-                                {
-                                    "id": 1,
-                                    "dep": "ROOT",
-                                    "head": 0,
-                                    "tag": "VERB",
-                                    "orth": "houdt",
-                                    "ner": "O",
-                                },
-                                {
-                                    "id": 2,
-                                    "dep": "case",
-                                    "head": 1,
-                                    "tag": "ADP",
-                                    "orth": "van",
-                                    "ner": "O",
-                                },
-                                {
-                                    "id": 3,
-                                    "dep": "obj",
-                                    "head": -2,
-                                    "tag": "NOUN",
-                                    "orth": "Marie",
-                                    "ner": "B-PER",
-                                },
-                                {
-                                    "id": 4,
-                                    "dep": "punct",
-                                    "head": -3,
-                                    "tag": "PUNCT",
-                                    "orth": ".",
-                                    "ner": "O",
-                                },
-                                {
-                                    "id": 5,
-                                    "dep": "",
-                                    "head": -1,
-                                    "tag": "SPACE",
-                                    "orth": "\n",
-                                    "ner": "O",
-                                },
-                            ],
-                            "brackets": [],
-                        }
-                    ],
-                }
-            ],
-        }
-    ]
-
-    train_file.write(json.dumps(train_corpus).encode("utf-8"))
-    train_file.close()
-    train_data = train_file.name
-    dev_data = train_data
-
-    # spacy train -n 1 -g -1 nl output_nl training_corpus.json training \
-    # corpus.json
-    train(lang, output_dir, train_data, dev_data, n_iter=1)
-
-    assert True
@@ -1,25 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import random
-from spacy.lang.en import English
-
-
-def test_train_with_many_entity_types():
-    """Test issue that arises when too many labels are added to NER model.
-    NB: currently causes segfault!
-    """
-    train_data = []
-    train_data.extend([("One sentence", {"entities": []})])
-    entity_types = [str(i) for i in range(1000)]
-    nlp = English(pipeline=[])
-    ner = nlp.create_pipe("ner")
-    nlp.add_pipe(ner)
-    for entity_type in list(entity_types):
-        ner.add_label(entity_type)
-    optimizer = nlp.begin_training()
-    for i in range(20):
-        losses = {}
-        random.shuffle(train_data)
-        for statement, entities in train_data:
-            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
@@ -11,7 +11,7 @@ from spacy.lang.lex_attrs import is_stop
 from spacy.vectors import Vectors
 from spacy.vocab import Vocab
 from spacy.language import Language
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
 from spacy.pipeline import Tagger, EntityRecognizer
 from spacy.attrs import HEAD, DEP
 from spacy.matcher import Matcher
@@ -272,3 +272,60 @@ def test_issue1967(label):
     entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
     gold_parses = [(None, [(entry, None)])]
     ner.moves.get_actions(gold_parses=gold_parses)
+
+
+def test_issue1971(en_vocab):
+    # Possibly related to #2675 and #2671?
+    matcher = Matcher(en_vocab)
+    pattern = [
+        {"ORTH": "Doe"},
+        {"ORTH": "!", "OP": "?"},
+        {"_": {"optional": True}, "OP": "?"},
+        {"ORTH": "!", "OP": "?"},
+    ]
+    Token.set_extension("optional", default=False)
+    matcher.add("TEST", None, pattern)
+    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
+    # We could also assert length 1 here, but this is more conclusive, because
+    # the real problem here is that it returns a duplicate match for a match_id
+    # that's not actually in the vocab!
+    matches = matcher(doc)
+    assert all([match_id in en_vocab.strings for match_id, start, end in matches])
+
+
+def test_issue_1971_2(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
+    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
+    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
+    matcher.add("TEST1", None, pattern1, pattern2)
+    matches = matcher(doc)
+    assert len(matches) == 2
+
+
+def test_issue_1971_3(en_vocab):
+    """Test that pattern matches correctly for multiple extension attributes."""
+    Token.set_extension("a", default=1, force=True)
+    Token.set_extension("b", default=2, force=True)
+    doc = Doc(en_vocab, words=["hello", "world"])
+    matcher = Matcher(en_vocab)
+    matcher.add("A", None, [{"_": {"a": 1}}])
+    matcher.add("B", None, [{"_": {"b": 2}}])
+    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
+    assert len(matches) == 4
+    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
+
+
+def test_issue_1971_4(en_vocab):
+    """Test that pattern matches correctly with multiple extension attribute
+    values on a single token.
+    """
+    Token.set_extension("ext_a", default="str_a", force=True)
+    Token.set_extension("ext_b", default="str_b", force=True)
+    matcher = Matcher(en_vocab)
+    doc = Doc(en_vocab, words=["this", "is", "text"])
+    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
+    matcher.add("TEST", None, pattern)
+    matches = matcher(doc)
+    # Uncommenting this caused a segmentation fault
+    assert len(matches) == 1
@@ -1,62 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy.matcher import Matcher
-from spacy.tokens import Token, Doc
-
-
-def test_issue1971(en_vocab):
-    # Possibly related to #2675 and #2671?
-    matcher = Matcher(en_vocab)
-    pattern = [
-        {"ORTH": "Doe"},
-        {"ORTH": "!", "OP": "?"},
-        {"_": {"optional": True}, "OP": "?"},
-        {"ORTH": "!", "OP": "?"},
-    ]
-    Token.set_extension("optional", default=False)
-    matcher.add("TEST", None, pattern)
-    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
-    # We could also assert length 1 here, but this is more conclusive, because
-    # the real problem here is that it returns a duplicate match for a match_id
-    # that's not actually in the vocab!
-    matches = matcher(doc)
-    assert all([match_id in en_vocab.strings for match_id, start, end in matches])
-
-
-def test_issue_1971_2(en_vocab):
-    matcher = Matcher(en_vocab)
-    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
-    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
-    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
-    matcher.add("TEST1", None, pattern1, pattern2)
-    matches = matcher(doc)
-    assert len(matches) == 2
-
-
-def test_issue_1971_3(en_vocab):
-    """Test that pattern matches correctly for multiple extension attributes."""
-    Token.set_extension("a", default=1, force=True)
-    Token.set_extension("b", default=2, force=True)
-    doc = Doc(en_vocab, words=["hello", "world"])
-    matcher = Matcher(en_vocab)
-    matcher.add("A", None, [{"_": {"a": 1}}])
-    matcher.add("B", None, [{"_": {"b": 2}}])
-    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
-    assert len(matches) == 4
-    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
-
-
-def test_issue_1971_4(en_vocab):
-    """Test that pattern matches correctly with multiple extension attribute
-    values on a single token.
-    """
-    Token.set_extension("ext_a", default="str_a", force=True)
-    Token.set_extension("ext_b", default="str_b", force=True)
-    matcher = Matcher(en_vocab)
-    doc = Doc(en_vocab, words=["this", "is", "text"])
-    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
-    matcher.add("TEST", None, pattern)
-    matches = matcher(doc)
-    # Uncommenting this caused a segmentation fault
-    assert len(matches) == 1
@@ -2,15 +2,18 @@
 from __future__ import unicode_literals
 
 import pytest
+from spacy import displacy
 from spacy.lang.en import English
 from spacy.lang.ja import Japanese
 from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.matcher import Matcher
-from spacy.tokens import Span
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
+from spacy.compat import pickle
 from spacy._ml import link_vectors_to_models
 import numpy
+import random
 
 from ..util import get_doc
 
@@ -54,6 +57,25 @@ def test_issue2626_2835(en_tokenizer, text):
     assert doc
 
 
+def test_issue2656(en_tokenizer):
+    """Test that tokenizer correctly splits of punctuation after numbers with
+    decimal points.
+    """
+    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
+    assert len(doc) == 11
+    assert doc[0].text == "I"
+    assert doc[1].text == "went"
+    assert doc[2].text == "for"
+    assert doc[3].text == "40.3"
+    assert doc[4].text == ","
+    assert doc[5].text == "and"
+    assert doc[6].text == "got"
+    assert doc[7].text == "home"
+    assert doc[8].text == "by"
+    assert doc[9].text == "10.0"
+    assert doc[10].text == "."
+
+
 def test_issue2671():
     """Ensure the correct entity ID is returned for matches with quantifiers.
     See also #2675
@@ -77,6 +99,17 @@ def test_issue2671():
     assert nlp.vocab.strings[match_id] == pattern_id
 
 
+def test_issue2728(en_vocab):
+    """Test that displaCy ENT visualizer escapes HTML correctly."""
+    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
+    doc.ents = [Span(doc, 0, 1, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
+    doc.ents = [Span(doc, 1, 2, label="TEST")]
+    html = displacy.render(doc, style="ent")
+    assert "&lt;RELEASE&gt;" in html
+
+
 def test_issue2754(en_tokenizer):
     """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
     a = en_tokenizer("a")
@@ -106,6 +139,48 @@ def test_issue2782(text, lang_cls):
     assert doc[0].like_num
 
 
+def test_issue2800():
+    """Test issue that arises when too many labels are added to NER model.
+    Used to cause segfault.
+    """
+    train_data = []
+    train_data.extend([("One sentence", {"entities": []})])
+    entity_types = [str(i) for i in range(1000)]
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    for entity_type in list(entity_types):
+        ner.add_label(entity_type)
+    optimizer = nlp.begin_training()
+    for i in range(20):
+        losses = {}
+        random.shuffle(train_data)
+        for statement, entities in train_data:
+            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
+
+
+def test_issue2822(it_tokenizer):
+    """Test that the abbreviation of poco is kept as one word."""
+    doc = it_tokenizer("Vuoi un po' di zucchero?")
+    assert len(doc) == 6
+    assert doc[0].text == "Vuoi"
+    assert doc[1].text == "un"
+    assert doc[2].text == "po'"
+    assert doc[2].lemma_ == "poco"
+    assert doc[3].text == "di"
+    assert doc[4].text == "zucchero"
+    assert doc[5].text == "?"
+
+
+def test_issue2833(en_vocab):
+    """Test that a custom error is raised if a token or span is pickled."""
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    with pytest.raises(NotImplementedError):
+        pickle.dumps(doc[0])
+    with pytest.raises(NotImplementedError):
+        pickle.dumps(doc[0:2])
+
+
 def test_issue2871():
     """Test that vectors recover the correct key for spaCy reserved words."""
     words = ["dog", "cat", "SUFFIX"]
@@ -134,3 +209,19 @@ def test_issue2901():
 
     doc = nlp("pythonが大好きです")
     assert doc
+
+
+def test_issue2926(fr_tokenizer):
+    """Test that the tokenizer correctly splits tokens separated by a slash (/)
+    ending in a digit.
+    """
+    doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
+    assert len(doc) == 8
+    assert doc[0].text == "Learn"
+    assert doc[1].text == "html5"
+    assert doc[2].text == "/"
+    assert doc[3].text == "css3"
+    assert doc[4].text == "/"
+    assert doc[5].text == "javascript"
+    assert doc[6].text == "/"
+    assert doc[7].text == "jquery"
@@ -1,24 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-from spacy.lang.en import English
-
-
-def test_issue2656():
-    """ Test that tokenizer correctly splits of punctuation after numbers with decimal points """
-    text = "I went for 40.3, and got home by 10.0."
-    nlp = English()
-    doc = nlp(text)
-
-    assert len(doc) == 11
-
-    assert doc[0].text == "I"
-    assert doc[1].text == "went"
-    assert doc[2].text == "for"
-    assert doc[3].text == "40.3"
-    assert doc[4].text == ","
-    assert doc[5].text == "and"
-    assert doc[6].text == "got"
-    assert doc[7].text == "home"
-    assert doc[8].text == "by"
-    assert doc[9].text == "10.0"
-    assert doc[10].text == "."
@@ -1,16 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from spacy import displacy
-from spacy.tokens import Doc, Span
-
-
-def test_issue2728(en_vocab):
-    """Test that displaCy ENT visualizer escapes HTML correctly."""
-    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
-    doc.ents = [Span(doc, 0, 1, label="TEST")]
-    html = displacy.render(doc, style="ent")
-    assert "&lt;RELEASE&gt;" in html
-    doc.ents = [Span(doc, 1, 2, label="TEST")]
-    html = displacy.render(doc, style="ent")
-    assert "&lt;RELEASE&gt;" in html
@@ -1,21 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-from spacy.lang.it import Italian
-
-
-def test_issue2822():
-    """ Test that the abbreviation of poco is kept as one word """
-    nlp = Italian()
-    text = "Vuoi un po' di zucchero?"
-
-    doc = nlp(text)
-
-    assert len(doc) == 6
-
-    assert doc[0].text == "Vuoi"
-    assert doc[1].text == "un"
-    assert doc[2].text == "po'"
-    assert doc[2].lemma_ == "poco"
-    assert doc[3].text == "di"
-    assert doc[4].text == "zucchero"
-    assert doc[5].text == "?"
@@ -1,15 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-from spacy.tokens import Doc
-from spacy.compat import pickle
-
-
-def test_issue2833(en_vocab):
-    """Test that a custom error is raised if a token or span is pickled."""
-    doc = Doc(en_vocab, words=["Hello", "world"])
-    with pytest.raises(NotImplementedError):
-        pickle.dumps(doc[0])
-    with pytest.raises(NotImplementedError):
-        pickle.dumps(doc[0:2])
@@ -1,21 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-from spacy.lang.fr import French
-
-
-def test_issue2926():
-    """ Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
-    nlp = French()
-    text = "Learn html5/css3/javascript/jquery"
-    doc = nlp(text)
-
-    assert len(doc) == 8
-
-    assert doc[0].text == "Learn"
-    assert doc[1].text == "html5"
-    assert doc[2].text == "/"
-    assert doc[3].text == "css3"
-    assert doc[4].text == "/"
-    assert doc[5].text == "javascript"
-    assert doc[6].text == "/"
-    assert doc[7].text == "jquery"
@@ -2,10 +2,8 @@
 from __future__ import unicode_literals
 
 from spacy.lang.en import English
-import pytest
 
 
-@pytest.mark.xfail
 def test_issue3209():
     """Test issue that occurred in spaCy nightly where NER labels were being
     mapped to classes incorrectly after loading the model, when the labels
@@ -66,9 +66,13 @@ cdef class Retokenizer:
             for extension in extensions:
                 _validate_extensions(extension)
             attrs = {key: value for key, value in attrs.items() if key != "_"}
+            # NB: Since we support {"KEY": [value, value]} syntax here, this
+            # will only "intify" the keys, not the values
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
             attrs["_"] = extensions
         else:
+            # NB: Since we support {"KEY": [value, value]} syntax here, this
+            # will only "intify" the keys, not the values
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         head_offsets = []
         for head in heads:
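The "intify the keys, not the values" comment above is easiest to see with the list-valued attrs used for splits. A minimal editor's sketch, not part of this commit, assuming spaCy v2.x's `intify_attrs` helper from `spacy.attrs`:

```python
from spacy.attrs import LEMMA, intify_attrs
from spacy.vocab import Vocab

vocab = Vocab()
# With the {"KEY": [value, value]} syntax, only the key is converted to an
# attribute ID; the list of per-subtoken values passes through unchanged and
# is resolved later (e.g. via get_string_id) for each subtoken.
attrs = intify_attrs({"lemma": ["los", "angeles"]}, strings_map=vocab.strings)
assert LEMMA in attrs
assert attrs[LEMMA] == ["los", "angeles"]
```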
@@ -153,7 +157,11 @@ def _merge(Doc doc, int start, int end, attributes):
         elif attr_name == TAG:
             doc.vocab.morphology.assign_tag(token, attr_value)
         else:
+            # Set attributes on both token and lexeme to take care of token
+            # attribute vs. lexical attribute without having to enumerate them.
+            # If an attribute name is not valid, set_struct_attr will ignore it.
             Token.set_struct_attr(token, attr_name, attr_value)
+            Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
     # Make sure ent_iob remains consistent
     if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
         if token.ent_type == doc.c[end].ent_type:
@@ -216,6 +224,7 @@ def _bulk_merge(Doc doc, merges):
     """
     cdef Span span
     cdef const LexemeC* lex
+    cdef TokenC* token
     cdef Pool mem = Pool()
     tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
     spans = []
@@ -231,15 +240,6 @@ def _bulk_merge(Doc doc, merges):
         # House the new merged token where it starts
         token = &doc.c[start]
         tokens[merge_index] = token
-        # Assign attributes
-        for attr_name, attr_value in attributes.items():
-            if attr_name == "_":  # Set extension attributes
-                for ext_attr_key, ext_attr_value in attr_value.items():
-                    doc[start]._.set(ext_attr_key, ext_attr_value)
-            elif attr_name == TAG:
-                doc.vocab.morphology.assign_tag(token, attr_value)
-            else:
-                Token.set_struct_attr(token, attr_name, attr_value)
     # Resize the doc.tensor, if it's set. Let the last row for each token stand
     # for the merged region. To do this, we create a boolean array indicating
     # whether the row is to be deleted, then use numpy.delete
@@ -255,14 +255,30 @@ def _bulk_merge(Doc doc, merges):
     # We update token.lex after keeping span root and dep, since
     # setting token.lex will change span.start and span.end properties
     # as it modifies the character offsets in the doc
-    for token_index in range(len(merges)):
+    for token_index, (span, attributes) in enumerate(merges):
         new_orth = ''.join([t.text_with_ws for t in spans[token_index]])
         if spans[token_index][-1].whitespace_:
             new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)]
+        token = tokens[token_index]
         lex = doc.vocab.get(doc.mem, new_orth)
-        tokens[token_index].lex = lex
+        token.lex = lex
         # We set trailing space here too
-        tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
+        token.spacy = doc.c[spans[token_index].end-1].spacy
+        py_token = span[0]
+        # Assign attributes
+        for attr_name, attr_value in attributes.items():
+            if attr_name == "_":  # Set extension attributes
+                for ext_attr_key, ext_attr_value in attr_value.items():
+                    py_token._.set(ext_attr_key, ext_attr_value)
+            elif attr_name == TAG:
+                doc.vocab.morphology.assign_tag(token, attr_value)
+            else:
+                # Set attributes on both token and lexeme to take care of token
+                # attribute vs. lexical attribute without having to enumerate
+                # them. If an attribute name is not valid, set_struct_attr will
+                # ignore it.
+                Token.set_struct_attr(token, attr_name, attr_value)
+                Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
     # Begin by setting all the head indices to absolute token positions
     # This is easier to work with for now than the offsets
     # Before thinking of something simpler, beware the case where a
@@ -281,7 +297,7 @@ def _bulk_merge(Doc doc, merges):
     current_offset = 0
     for i in range(doc.length):
         if current_span_index < len(spans) and i == spans[current_span_index].end:
-            #last token was the last of the span
+            # Last token was the last of the span
             current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
             current_span_index += 1
         if current_span_index < len(spans) and \
@@ -405,10 +421,17 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
            if attr_name == "_":
                for ext_attr_key, ext_attr_value in attr_value.items():
                    doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
+            # NB: We need to call get_string_id here because only the keys are
+            # "intified" (since we support "KEY": [value, value] syntax here).
            elif attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
            else:
+                # Set attributes on both token and lexeme to take care of token
+                # attribute vs. lexical attribute without having to enumerate
+                # them. If an attribute name is not valid, set_struct_attr will
+                # ignore it.
                Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
+                Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
     # Assign correct dependencies to the inner token
     for i, head in enumerate(heads):
         doc.c[token_index + i].head = head
@@ -402,9 +402,11 @@ invalidated, although they may accidentally continue to work.
 
 ### Retokenizer.merge {#retokenizer.merge tag="method"}
 
-Mark a span for merging. The `attrs` will be applied to the resulting token.
-Writable custom extension attributes can be provided as a dictionary mapping
-attribute names to values as the `"_"` key.
+Mark a span for merging. The `attrs` will be applied to the resulting token (if
+they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
+underlying lexeme (if they're context-independent lexical attributes like
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
+dictionary mapping attribute names to values as the `"_"` key.
 
 > #### Example
 >
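As an illustration of the updated paragraph above (an editor's sketch, not part of the diff; the `merged` extension name is made up for the example): context-dependent attrs like `LEMMA` land on the resulting token, lexical attrs like `IS_STOP` land on its lexeme, and custom extensions go under the `"_"` key.

```python
from spacy.tokens import Doc, Token
from spacy.vocab import Vocab

Token.set_extension("merged", default=False)  # hypothetical custom extension

doc = Doc(Vocab(), words=["I", "live", "in", "New", "York"])
with doc.retokenize() as retokenizer:
    retokenizer.merge(
        doc[3:5],
        attrs={"LEMMA": "new york", "IS_STOP": True, "_": {"merged": True}},
    )
assert doc[3].lemma_ == "new york"   # context-dependent: stored on the Token
assert doc[3].is_stop                # context-independent: stored on the Lexeme
assert doc[3]._.merged               # custom extension attribute
```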
@@ -431,7 +433,10 @@ second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other
 newly created subtokens, without having to keep track of the changing token
 indices. If the specified head token will be split within the retokenizer block
 and no subtoken index is specified, it will default to `0`. Attributes to set on
-subtokens can be provided as a list of values.
+subtokens can be provided as a list of values. They'll be applied to the
+resulting token (if they're context-dependent token attributes like `LEMMA` or
+`DEP`) or to the underlying lexeme (if they're context-independent lexical
+attributes like `LOWER` or `IS_STOP`).
 
 > #### Example
 >
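A sketch of the list-valued `attrs` described above (editor's example, not from the diff; assumes a blank English pipeline that keeps "NewYork" as a single token):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I live in NewYork")
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]  # "New" attaches to "York", "York" to "in"
    # One value per subtoken; lexical attrs like IS_STOP end up on the lexemes
    attrs = {"LEMMA": ["new", "york"], "IS_STOP": [False, False]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
assert [t.text for t in doc] == ["I", "live", "in", "New", "York"]
assert doc[4].lemma_ == "york"
```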
@@ -995,6 +995,14 @@
     print("After:", [token.text for token in doc])
 ```
 
+If an attribute in the `attrs` is a context-dependent token attribute, it will
+be applied to the underlying [`Token`](/api/token). For example `LEMMA`, `POS`
+or `DEP` only apply to a word in context, so they're token attributes. If an
+attribute is a context-independent lexical attribute, it will be applied to the
+underlying [`Lexeme`](/api/lexeme), the entry in the vocabulary. For example,
+`LOWER` or `IS_STOP` apply to all words of the same spelling, regardless of the
+context.
+
 <Infobox title="Tip: merging entities and noun phrases">
 
 If you need to merge named entities or noun chunks, check out the built-in
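A minimal sketch of that distinction (editor's example, assuming a blank English pipeline): the token-level `LEMMA` stays on the merged token, while the lexical `IS_STOP` flag is stored on the shared vocab entry.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I live in New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new york", "IS_STOP": True})
assert doc[3].lemma_ == "new york"    # applied to the Token
assert nlp.vocab["New York"].is_stop  # applied to the Lexeme in the vocab
```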