Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 17:24:41 +03:00)

Commit 17038fe768: Merge branch 'develop' into spacy.io

@@ -89,7 +89,7 @@ cdef class Lexeme:
            return lex.lang
        else:
            return 0

    @staticmethod
    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
        cdef flags_t one = 1

@@ -214,9 +214,6 @@ def test_doc_retokenize_spans_entity_merge_iob():
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    for token in doc:
        print(token)
        print(token.ent_iob)
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"

@@ -270,16 +267,16 @@ def test_doc_retokenize_merge_extension_attrs(en_vocab):
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a == True
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a == True
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a == None
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"

@@ -292,3 +289,29 @@ def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)


def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
    assert doc[0].lemma_ == "hello world"
    assert doc[0].is_stop
    # Test bulk merging
    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
    assert not any(t.like_num for t in doc)
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"like_num": True})
        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
    assert doc[0].like_num
    assert doc[1].is_stop
    assert not doc[0].is_stop
    assert not doc[1].like_num

@@ -137,10 +137,10 @@ def test_doc_retokenize_split_extension_attrs(en_vocab):
        attrs = {"lemma": ["los", "angeles"], "_": underscore}
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].lemma_ == "los"
    assert doc[0]._.a == True
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1].lemma_ == "angeles"
    assert doc[1]._.a == False
    assert doc[1]._.a is False
    assert doc[1]._.b == "2"

@@ -165,3 +165,21 @@ def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)


def test_doc_retokenizer_split_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    assert not Doc(en_vocab, words=["Los"])[0].is_stop
    assert not Doc(en_vocab, words=["Angeles"])[0].is_stop
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    assert not doc[0].is_stop
    with doc.retokenize() as retokenizer:
        attrs = {"is_stop": [True, False]}
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
    assert doc[0].is_stop
    assert not doc[1].is_stop

@@ -1,89 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

import json
from tempfile import NamedTemporaryFile

from spacy.cli.train import train


def test_cli_trained_model_can_be_saved(tmpdir):
    lang = "nl"
    output_dir = str(tmpdir)
    train_file = NamedTemporaryFile("wb", dir=output_dir, delete=False)
    train_corpus = [
        {
            "id": "identifier_0",
            "paragraphs": [
                {
                    "raw": "Jan houdt van Marie.\n",
                    "sentences": [
                        {
                            "tokens": [
                                {
                                    "id": 0,
                                    "dep": "nsubj",
                                    "head": 1,
                                    "tag": "NOUN",
                                    "orth": "Jan",
                                    "ner": "B-PER",
                                },
                                {
                                    "id": 1,
                                    "dep": "ROOT",
                                    "head": 0,
                                    "tag": "VERB",
                                    "orth": "houdt",
                                    "ner": "O",
                                },
                                {
                                    "id": 2,
                                    "dep": "case",
                                    "head": 1,
                                    "tag": "ADP",
                                    "orth": "van",
                                    "ner": "O",
                                },
                                {
                                    "id": 3,
                                    "dep": "obj",
                                    "head": -2,
                                    "tag": "NOUN",
                                    "orth": "Marie",
                                    "ner": "B-PER",
                                },
                                {
                                    "id": 4,
                                    "dep": "punct",
                                    "head": -3,
                                    "tag": "PUNCT",
                                    "orth": ".",
                                    "ner": "O",
                                },
                                {
                                    "id": 5,
                                    "dep": "",
                                    "head": -1,
                                    "tag": "SPACE",
                                    "orth": "\n",
                                    "ner": "O",
                                },
                            ],
                            "brackets": [],
                        }
                    ],
                }
            ],
        }
    ]

    train_file.write(json.dumps(train_corpus).encode("utf-8"))
    train_file.close()
    train_data = train_file.name
    dev_data = train_data

    # spacy train -n 1 -g -1 nl output_nl training_corpus.json training \
    # corpus.json
    train(lang, output_dir, train_data, dev_data, n_iter=1)

    assert True

@@ -1,25 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

import random
from spacy.lang.en import English


def test_train_with_many_entity_types():
    """Test issue that arises when too many labels are added to NER model.
    NB: currently causes segfault!
    """
    train_data = []
    train_data.extend([("One sentence", {"entities": []})])
    entity_types = [str(i) for i in range(1000)]
    nlp = English(pipeline=[])
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for entity_type in list(entity_types):
        ner.add_label(entity_type)
    optimizer = nlp.begin_training()
    for i in range(20):
        losses = {}
        random.shuffle(train_data)
        for statement, entities in train_data:
            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)

@@ -11,7 +11,7 @@ from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.tokens import Doc, Span, Token
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher

@@ -272,3 +272,60 @@ def test_issue1967(label):
    entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
    gold_parses = [(None, [(entry, None)])]
    ner.moves.get_actions(gold_parses=gold_parses)


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1

@@ -1,62 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.matcher import Matcher
from spacy.tokens import Token, Doc


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1

@@ -2,15 +2,18 @@
from __future__ import unicode_literals

import pytest
from spacy import displacy
from spacy.lang.en import English
from spacy.lang.ja import Japanese
from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.compat import pickle
from spacy._ml import link_vectors_to_models
import numpy
import random

from ..util import get_doc

@@ -54,6 +57,25 @@ def test_issue2626_2835(en_tokenizer, text):
    assert doc


def test_issue2656(en_tokenizer):
    """Test that tokenizer correctly splits of punctuation after numbers with
    decimal points.
    """
    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
    assert len(doc) == 11
    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."


def test_issue2671():
    """Ensure the correct entity ID is returned for matches with quantifiers.
    See also #2675

@@ -77,6 +99,17 @@ def test_issue2671():
    assert nlp.vocab.strings[match_id] == pattern_id


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html


def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
    a = en_tokenizer("a")

@@ -106,6 +139,48 @@ def test_issue2782(text, lang_cls):
    assert doc[0].like_num


def test_issue2800():
    """Test issue that arises when too many labels are added to NER model.
    Used to cause segfault.
    """
    train_data = []
    train_data.extend([("One sentence", {"entities": []})])
    entity_types = [str(i) for i in range(1000)]
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for entity_type in list(entity_types):
        ner.add_label(entity_type)
    optimizer = nlp.begin_training()
    for i in range(20):
        losses = {}
        random.shuffle(train_data)
        for statement, entities in train_data:
            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)


def test_issue2822(it_tokenizer):
    """Test that the abbreviation of poco is kept as one word."""
    doc = it_tokenizer("Vuoi un po' di zucchero?")
    assert len(doc) == 6
    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])


def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]

@@ -134,3 +209,19 @@ def test_issue2901():

    doc = nlp("pythonが大好きです")
    assert doc


def test_issue2926(fr_tokenizer):
    """Test that the tokenizer correctly splits tokens separated by a slash (/)
    ending in a digit.
    """
    doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
    assert len(doc) == 8
    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"

@@ -1,24 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English


def test_issue2656():
    """ Test that tokenizer correctly splits of punctuation after numbers with decimal points """
    text = "I went for 40.3, and got home by 10.0."
    nlp = English()
    doc = nlp(text)

    assert len(doc) == 11

    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."

@@ -1,16 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy import displacy
from spacy.tokens import Doc, Span


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html

@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.it import Italian


def test_issue2822():
    """ Test that the abbreviation of poco is kept as one word """
    nlp = Italian()
    text = "Vuoi un po' di zucchero?"

    doc = nlp(text)

    assert len(doc) == 6

    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"

@@ -1,15 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.tokens import Doc
from spacy.compat import pickle


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])

@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.fr import French


def test_issue2926():
    """ Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
    nlp = French()
    text = "Learn html5/css3/javascript/jquery"
    doc = nlp(text)

    assert len(doc) == 8

    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"

@@ -2,10 +2,8 @@
from __future__ import unicode_literals

from spacy.lang.en import English
import pytest


@pytest.mark.xfail
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels

@@ -66,9 +66,13 @@ cdef class Retokenizer:
            for extension in extensions:
                _validate_extensions(extension)
            attrs = {key: value for key, value in attrs.items() if key != "_"}
            # NB: Since we support {"KEY": [value, value]} syntax here, this
            # will only "intify" the keys, not the values
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
            attrs["_"] = extensions
        else:
            # NB: Since we support {"KEY": [value, value]} syntax here, this
            # will only "intify" the keys, not the values
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        head_offsets = []
        for head in heads:

@@ -153,7 +157,11 @@ def _merge(Doc doc, int start, int end, attributes):
        elif attr_name == TAG:
            doc.vocab.morphology.assign_tag(token, attr_value)
        else:
            # Set attributes on both token and lexeme to take care of token
            # attribute vs. lexical attribute without having to enumerate them.
            # If an attribute name is not valid, set_struct_attr will ignore it.
            Token.set_struct_attr(token, attr_name, attr_value)
            Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
    # Make sure ent_iob remains consistent
    if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
        if token.ent_type == doc.c[end].ent_type:

@@ -216,6 +224,7 @@ def _bulk_merge(Doc doc, merges):
    """
    cdef Span span
    cdef const LexemeC* lex
    cdef TokenC* token
    cdef Pool mem = Pool()
    tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC))
    spans = []

@@ -231,15 +240,6 @@ def _bulk_merge(Doc doc, merges):
        # House the new merged token where it starts
        token = &doc.c[start]
        tokens[merge_index] = token
        # Assign attributes
        for attr_name, attr_value in attributes.items():
            if attr_name == "_":  # Set extension attributes
                for ext_attr_key, ext_attr_value in attr_value.items():
                    doc[start]._.set(ext_attr_key, ext_attr_value)
            elif attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, attr_value)
            else:
                Token.set_struct_attr(token, attr_name, attr_value)
    # Resize the doc.tensor, if it's set. Let the last row for each token stand
    # for the merged region. To do this, we create a boolean array indicating
    # whether the row is to be deleted, then use numpy.delete

@@ -255,14 +255,30 @@ def _bulk_merge(Doc doc, merges):
    # We update token.lex after keeping span root and dep, since
    # setting token.lex will change span.start and span.end properties
    # as it modifies the character offsets in the doc
    for token_index in range(len(merges)):
    for token_index, (span, attributes) in enumerate(merges):
        new_orth = ''.join([t.text_with_ws for t in spans[token_index]])
        if spans[token_index][-1].whitespace_:
            new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)]
        token = tokens[token_index]
        lex = doc.vocab.get(doc.mem, new_orth)
        tokens[token_index].lex = lex
        token.lex = lex
        # We set trailing space here too
        tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
        token.spacy = doc.c[spans[token_index].end-1].spacy
        py_token = span[0]
        # Assign attributes
        for attr_name, attr_value in attributes.items():
            if attr_name == "_":  # Set extension attributes
                for ext_attr_key, ext_attr_value in attr_value.items():
                    py_token._.set(ext_attr_key, ext_attr_value)
            elif attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, attr_value)
            else:
                # Set attributes on both token and lexeme to take care of token
                # attribute vs. lexical attribute without having to enumerate
                # them. If an attribute name is not valid, set_struct_attr will
                # ignore it.
                Token.set_struct_attr(token, attr_name, attr_value)
                Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
    # Begin by setting all the head indices to absolute token positions
    # This is easier to work with for now than the offsets
    # Before thinking of something simpler, beware the case where a

@@ -281,7 +297,7 @@ def _bulk_merge(Doc doc, merges):
    current_offset = 0
    for i in range(doc.length):
        if current_span_index < len(spans) and i == spans[current_span_index].end:
            #last token was the last of the span
            # Last token was the last of the span
            current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
            current_span_index += 1
        if current_span_index < len(spans) and \

@@ -405,10 +421,17 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
            if attr_name == "_":
                for ext_attr_key, ext_attr_value in attr_value.items():
                    doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
            # NB: We need to call get_string_id here because only the keys are
            # "intified" (since we support "KEY": [value, value] syntax here).
            elif attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
            else:
                # Set attributes on both token and lexeme to take care of token
                # attribute vs. lexical attribute without having to enumerate
                # them. If an attribute name is not valid, set_struct_attr will
                # ignore it.
                Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
                Lexeme.set_struct_attr(<LexemeC*>token.lex, attr_name, get_string_id(attr_value))
    # Assign correct dependencies to the inner token
    for i, head in enumerate(heads):
        doc.c[token_index + i].head = head

@@ -402,9 +402,11 @@ invalidated, although they may accidentally continue to work.

### Retokenizer.merge {#retokenizer.merge tag="method"}

Mark a span for merging. The `attrs` will be applied to the resulting token.
Writable custom extension attributes can be provided as a dictionary mapping
attribute names to values as the `"_"` key.
Mark a span for merging. The `attrs` will be applied to the resulting token (if
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
underlying lexeme (if they're context-independent lexical attributes like
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
dictionary mapping attribute names to values as the `"_"` key.

> #### Example
>

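The documentation's own example callout is truncated in this diff. As a rough sketch of the updated behaviour, based on the retokenizer tests added in this commit (the blank English pipeline and the sentence are illustrative placeholders, not taken from the docs):

```python
import spacy

nlp = spacy.blank("en")  # placeholder tokenizer-only pipeline for illustration
doc = nlp("I live in New York")
with doc.retokenize() as retokenizer:
    # "LEMMA" is a context-dependent token attribute and is set on the merged
    # token; "IS_STOP" is a lexical attribute and is written to the lexeme.
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new york", "IS_STOP": False})
print([(token.text, token.lemma_, token.is_stop) for token in doc])
```
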
@@ -431,7 +433,10 @@ second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other
newly created subtokens, without having to keep track of the changing token
indices. If the specified head token will be split within the retokenizer block
and no subtoken index is specified, it will default to `0`. Attributes to set on
subtokens can be provided as a list of values.
subtokens can be provided as a list of values. They'll be applied to the
resulting token (if they're context-dependent token attributes like `LEMMA` or
`DEP`) or to the underlying lexeme (if they're context-independent lexical
attributes like `LOWER` or `IS_STOP`).

> #### Example
>

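The docs example is likewise truncated here. A minimal sketch of the list-valued `attrs`, adapted from `test_doc_retokenizer_split_lex_attrs` in this commit (the bare `Vocab()` stands in for a shared vocab fixture):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["LosAngeles", "start"])
with doc.retokenize() as retokenizer:
    heads = [(doc[0], 1), doc[1]]
    # One value per subtoken: "is_stop" is a lexical attribute, so "Los" is
    # flagged as a stop word on its lexeme while "Angeles" is not.
    retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs={"is_stop": [True, False]})
print([(token.text, token.is_stop) for token in doc])
```
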
@@ -995,6 +995,14 @@ with doc.retokenize() as retokenizer:
    print("After:", [token.text for token in doc])
```

If an attribute in the `attrs` is a context-dependent token attribute, it will
be applied to the underlying [`Token`](/api/token). For example `LEMMA`, `POS`
or `DEP` only apply to a word in context, so they're token attributes. If an
attribute is a context-independent lexical attribute, it will be applied to the
underlying [`Lexeme`](/api/lexeme), the entry in the vocabulary. For example,
`LOWER` or `IS_STOP` apply to all words of the same spelling, regardless of the
context.
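
For instance, a sketch of the difference, following `test_doc_retokenizer_merge_lex_attrs` from this commit (the final lookup of the merged lexeme on the shared vocab is an inference from the behaviour described above, not taken verbatim from the docs):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world", "!"])
with doc.retokenize() as retokenizer:
    # "lemma" is context-dependent and lands on the merged token only;
    # "is_stop" is context-independent and is written to the lexeme.
    retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
assert doc[0].lemma_ == "hello world"
assert doc[0].is_stop
# The lexical flag should now be visible on the shared vocab entry as well.
assert vocab["hello world"].is_stop
```
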
<Infobox title="Tip: merging entities and noun phrases">

If you need to merge named entities or noun chunks, check out the built-in