Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2018-09-28 14:27:35 +02:00
commit e3e9fe18d4
6 changed files with 47 additions and 42 deletions

View File

@ -12,7 +12,7 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups from ...util import update_exc, add_lookups
from ...language import Language from ...language import Language
from ...attrs import LANG, LIKE_NUM, NORM from ...attrs import LANG, NORM
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):

View File

@ -5,10 +5,26 @@ import pytest
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.displacy import render from spacy.displacy import render
from spacy.gold import iob_to_biluo from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
from ..util import add_vecs_to_vocab from ..util import add_vecs_to_vocab
@pytest.mark.xfail
def test_issue2179():
    """Check that an NER label survives a to_bytes/from_bytes round trip.

    A pipeline whose NER component carries one custom label is initialised
    for training and serialized. The bytes are loaded into a second pipeline
    that has a fresh NER component. The loaded component must not have an
    'extra_labels' entry in its cfg and must report exactly the one label.
    """
    # Source pipeline: NER with a single custom label.
    source_nlp = Italian()
    source_ner = source_nlp.create_pipe('ner')
    source_ner.add_label('CITIZENSHIP')
    source_nlp.add_pipe(source_ner)
    source_nlp.begin_training()

    # Target pipeline: untouched NER component that receives the bytes.
    restored_nlp = Italian()
    restored_nlp.add_pipe(restored_nlp.create_pipe('ner'))
    payload = source_nlp.to_bytes()
    restored_nlp.from_bytes(payload)

    loaded_ner = restored_nlp.get_pipe('ner')
    assert 'extra_labels' not in loaded_ner.cfg
    assert loaded_ner.labels == ['CITIZENSHIP']
def test_issue2219(en_vocab): def test_issue2219(en_vocab):
vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])]
add_vecs_to_vocab(en_vocab, vectors) add_vecs_to_vocab(en_vocab, vectors)

View File

@ -1,16 +0,0 @@
'''Test that spurious 'extra_labels' aren't created when initializing NER.'''
import pytest
from ... import blank
@pytest.mark.xfail
def test_issue2179():
    """Check that an NER label survives a to_bytes/from_bytes round trip.

    Builds a blank Italian pipeline with one custom NER label, serializes it
    and loads the bytes into a second blank pipeline. The loaded NER component
    must not have an 'extra_labels' entry in its cfg and must report exactly
    the one label.
    """
    # Source pipeline: NER with a single custom label.
    source_nlp = blank('it')
    source_ner = source_nlp.create_pipe('ner')
    source_ner.add_label('CITIZENSHIP')
    source_nlp.add_pipe(source_ner)
    source_nlp.begin_training()

    # Target pipeline: untouched NER component that receives the bytes.
    restored_nlp = blank('it')
    restored_nlp.add_pipe(restored_nlp.create_pipe('ner'))
    payload = source_nlp.to_bytes()
    restored_nlp.from_bytes(payload)

    loaded_ner = restored_nlp.get_pipe('ner')
    assert 'extra_labels' not in loaded_ner.cfg
    assert loaded_ner.labels == ['CITIZENSHIP']

View File

@ -1,11 +1,10 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import spacy
def test_issue2626():
'''Check that this sentence doesn't cause an infinite loop in the tokenizer.''' def test_issue2626(en_tokenizer):
nlp = spacy.blank('en') """Check that sentence doesn't cause an infinite loop in the tokenizer."""
text = """ text = """
ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of 
ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume
""" """
doc = nlp.make_doc(text) doc = en_tokenizer(text)

View File

@ -1,29 +1,30 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from ...lang.en import English
from ...matcher import Matcher
def get_rule_id(nlp, matcher, doc): import pytest
matches = matcher(doc) from spacy.lang.en import English
for match_id, start, end in matches: from spacy.matcher import Matcher
rule_id = nlp.vocab.strings[match_id]
span = doc[start:end]
return rule_id
def test_issue2671(): def test_issue2671():
'''Ensure the correct entity ID is returned for matches with quantifiers. """Ensure the correct entity ID is returned for matches with quantifiers.
See also #2675 See also #2675
''' """
def get_rule_id(nlp, matcher, doc):
matches = matcher(doc)
for match_id, start, end in matches:
rule_id = nlp.vocab.strings[match_id]
span = doc[start:end]
return rule_id
nlp = English() nlp = English()
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
pattern_id = 'test_pattern'
pattern = [{'LOWER': 'high'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'adrenaline'}] pattern = [{'LOWER': 'high'},
matcher.add("test_pattern", None, pattern) {'IS_PUNCT': True, 'OP': '?'},
{'LOWER': 'adrenaline'}]
matcher.add(pattern_id, None, pattern)
doc1 = nlp("This is a high-adrenaline situation.") doc1 = nlp("This is a high-adrenaline situation.")
doc2 = nlp("This is a high adrenaline situation.") doc2 = nlp("This is a high adrenaline situation.")
# Works correctly assert get_rule_id(nlp, matcher, doc1) == pattern_id
assert get_rule_id(nlp, matcher, doc1) == 'test_pattern' assert get_rule_id(nlp, matcher, doc2) == pattern_id
assert get_rule_id(nlp, matcher, doc2) == 'test_pattern'

View File

@ -1,8 +1,13 @@
'''Test that deprojectivization doesn't mess up sentence boundaries.''' # coding: utf-8
from __future__ import unicode_literals
import pytest import pytest
from ..util import get_doc from ..util import get_doc
def test_issue2772(en_vocab): def test_issue2772(en_vocab):
"""Test that deprojectivization doesn't mess up sentence boundaries."""
words = 'When we write or communicate virtually , we can hide our true feelings .'.split() words = 'When we write or communicate virtually , we can hide our true feelings .'.split()
# A tree with a non-projective (i.e. crossing) arc # A tree with a non-projective (i.e. crossing) arc
# The arcs (0, 4) and (2, 9) cross. # The arcs (0, 4) and (2, 9) cross.