2020-07-06 15:05:59 +03:00
import pytest
from spacy . language import Language
from spacy . vocab import Vocab
from spacy . pipeline import EntityRuler , DependencyParser
2020-07-22 14:42:59 +03:00
from spacy . pipeline . dep_parser import DEFAULT_PARSER_MODEL
2020-07-06 15:05:59 +03:00
from spacy import displacy , load
from spacy . displacy import parse_deps
from spacy . tokens import Doc , Token
from spacy . matcher import Matcher , PhraseMatcher
from spacy . errors import MatchPatternError
from spacy . util import minibatch
from spacy . gold import Example
from spacy . lang . hi import Hindi
from spacy . lang . es import Spanish
from spacy . lang . en import English
from spacy . attrs import IS_ALPHA
2020-07-22 14:42:59 +03:00
from spacy import registry
2020-07-06 15:05:59 +03:00
from thinc . api import compounding
import spacy
import srsly
import numpy
from . . util import make_tempdir , get_doc
@pytest.mark.parametrize ( " word " , [ " don ' t " , " don’ t " , " I ' d " , " I’ d " ] )
def test_issue3521 ( en_tokenizer , word ) :
tok = en_tokenizer ( word ) [ 1 ]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok . is_stop
def test_issue_3526_1 ( en_vocab ) :
patterns = [
{ " label " : " HELLO " , " pattern " : " hello world " } ,
{ " label " : " BYE " , " pattern " : [ { " LOWER " : " bye " } , { " LOWER " : " bye " } ] } ,
{ " label " : " HELLO " , " pattern " : [ { " ORTH " : " HELLO " } ] } ,
{ " label " : " COMPLEX " , " pattern " : [ { " ORTH " : " foo " , " OP " : " * " } ] } ,
{ " label " : " TECH_ORG " , " pattern " : " Apple " , " id " : " a1 " } ,
]
nlp = Language ( vocab = en_vocab )
ruler = EntityRuler ( nlp , patterns = patterns , overwrite_ents = True )
ruler_bytes = ruler . to_bytes ( )
assert len ( ruler ) == len ( patterns )
assert len ( ruler . labels ) == 4
assert ruler . overwrite
new_ruler = EntityRuler ( nlp )
new_ruler = new_ruler . from_bytes ( ruler_bytes )
assert len ( new_ruler ) == len ( ruler )
assert len ( new_ruler . labels ) == 4
assert new_ruler . overwrite == ruler . overwrite
assert new_ruler . ent_id_sep == ruler . ent_id_sep
def test_issue_3526_2 ( en_vocab ) :
patterns = [
{ " label " : " HELLO " , " pattern " : " hello world " } ,
{ " label " : " BYE " , " pattern " : [ { " LOWER " : " bye " } , { " LOWER " : " bye " } ] } ,
{ " label " : " HELLO " , " pattern " : [ { " ORTH " : " HELLO " } ] } ,
{ " label " : " COMPLEX " , " pattern " : [ { " ORTH " : " foo " , " OP " : " * " } ] } ,
{ " label " : " TECH_ORG " , " pattern " : " Apple " , " id " : " a1 " } ,
]
nlp = Language ( vocab = en_vocab )
ruler = EntityRuler ( nlp , patterns = patterns , overwrite_ents = True )
bytes_old_style = srsly . msgpack_dumps ( ruler . patterns )
new_ruler = EntityRuler ( nlp )
new_ruler = new_ruler . from_bytes ( bytes_old_style )
assert len ( new_ruler ) == len ( ruler )
for pattern in ruler . patterns :
assert pattern in new_ruler . patterns
assert new_ruler . overwrite is not ruler . overwrite
def test_issue_3526_3 ( en_vocab ) :
patterns = [
{ " label " : " HELLO " , " pattern " : " hello world " } ,
{ " label " : " BYE " , " pattern " : [ { " LOWER " : " bye " } , { " LOWER " : " bye " } ] } ,
{ " label " : " HELLO " , " pattern " : [ { " ORTH " : " HELLO " } ] } ,
{ " label " : " COMPLEX " , " pattern " : [ { " ORTH " : " foo " , " OP " : " * " } ] } ,
{ " label " : " TECH_ORG " , " pattern " : " Apple " , " id " : " a1 " } ,
]
nlp = Language ( vocab = en_vocab )
ruler = EntityRuler ( nlp , patterns = patterns , overwrite_ents = True )
with make_tempdir ( ) as tmpdir :
out_file = tmpdir / " entity_ruler "
srsly . write_jsonl ( out_file . with_suffix ( " .jsonl " ) , ruler . patterns )
new_ruler = EntityRuler ( nlp ) . from_disk ( out_file )
for pattern in ruler . patterns :
assert pattern in new_ruler . patterns
assert len ( new_ruler ) == len ( ruler )
assert new_ruler . overwrite is not ruler . overwrite
@pytest.mark.filterwarnings ( " ignore::UserWarning " )
def test_issue_3526_4 ( en_vocab ) :
nlp = Language ( vocab = en_vocab )
2020-07-22 14:42:59 +03:00
patterns = [ { " label " : " ORG " , " pattern " : " Apple " } ]
config = { " overwrite_ents " : True }
ruler = nlp . add_pipe ( " entity_ruler " , config = config )
ruler . add_patterns ( patterns )
2020-07-06 15:05:59 +03:00
with make_tempdir ( ) as tmpdir :
nlp . to_disk ( tmpdir )
ruler = nlp . get_pipe ( " entity_ruler " )
assert ruler . patterns == [ { " label " : " ORG " , " pattern " : " Apple " } ]
assert ruler . overwrite is True
nlp2 = load ( tmpdir )
new_ruler = nlp2 . get_pipe ( " entity_ruler " )
assert new_ruler . patterns == [ { " label " : " ORG " , " pattern " : " Apple " } ]
assert new_ruler . overwrite is True
def test_issue3531 ( ) :
""" Test that displaCy renderer doesn ' t require " settings " key. """
example_dep = {
" words " : [
{ " text " : " But " , " tag " : " CCONJ " } ,
{ " text " : " Google " , " tag " : " PROPN " } ,
{ " text " : " is " , " tag " : " VERB " } ,
{ " text " : " starting " , " tag " : " VERB " } ,
{ " text " : " from " , " tag " : " ADP " } ,
{ " text " : " behind. " , " tag " : " ADV " } ,
] ,
" arcs " : [
{ " start " : 0 , " end " : 3 , " label " : " cc " , " dir " : " left " } ,
{ " start " : 1 , " end " : 3 , " label " : " nsubj " , " dir " : " left " } ,
{ " start " : 2 , " end " : 3 , " label " : " aux " , " dir " : " left " } ,
{ " start " : 3 , " end " : 4 , " label " : " prep " , " dir " : " right " } ,
{ " start " : 4 , " end " : 5 , " label " : " pcomp " , " dir " : " right " } ,
] ,
}
example_ent = {
" text " : " But Google is starting from behind. " ,
" ents " : [ { " start " : 4 , " end " : 10 , " label " : " ORG " } ] ,
}
dep_html = displacy . render ( example_dep , style = " dep " , manual = True )
assert dep_html
ent_html = displacy . render ( example_ent , style = " ent " , manual = True )
assert ent_html
def test_issue3540 ( en_vocab ) :
words = [ " I " , " live " , " in " , " NewYork " , " right " , " now " ]
tensor = numpy . asarray (
[ [ 1.0 , 1.1 ] , [ 2.0 , 2.1 ] , [ 3.0 , 3.1 ] , [ 4.0 , 4.1 ] , [ 5.0 , 5.1 ] , [ 6.0 , 6.1 ] ] ,
dtype = " f " ,
)
doc = Doc ( en_vocab , words = words )
doc . tensor = tensor
gold_text = [ " I " , " live " , " in " , " NewYork " , " right " , " now " ]
assert [ token . text for token in doc ] == gold_text
gold_lemma = [ " I " , " live " , " in " , " NewYork " , " right " , " now " ]
2020-08-07 16:27:13 +03:00
for i , lemma in enumerate ( gold_lemma ) :
doc [ i ] . lemma_ = lemma
2020-07-06 15:05:59 +03:00
assert [ token . lemma_ for token in doc ] == gold_lemma
vectors_1 = [ token . vector for token in doc ]
assert len ( vectors_1 ) == len ( doc )
with doc . retokenize ( ) as retokenizer :
heads = [ ( doc [ 3 ] , 1 ) , doc [ 2 ] ]
2020-08-07 16:27:13 +03:00
attrs = { " POS " : [ " PROPN " , " PROPN " ] , " LEMMA " : [ " New " , " York " ] , " DEP " : [ " pobj " , " compound " ] }
2020-07-06 15:05:59 +03:00
retokenizer . split ( doc [ 3 ] , [ " New " , " York " ] , heads = heads , attrs = attrs )
gold_text = [ " I " , " live " , " in " , " New " , " York " , " right " , " now " ]
assert [ token . text for token in doc ] == gold_text
gold_lemma = [ " I " , " live " , " in " , " New " , " York " , " right " , " now " ]
assert [ token . lemma_ for token in doc ] == gold_lemma
vectors_2 = [ token . vector for token in doc ]
assert len ( vectors_2 ) == len ( doc )
assert vectors_1 [ 0 ] . tolist ( ) == vectors_2 [ 0 ] . tolist ( )
assert vectors_1 [ 1 ] . tolist ( ) == vectors_2 [ 1 ] . tolist ( )
assert vectors_1 [ 2 ] . tolist ( ) == vectors_2 [ 2 ] . tolist ( )
assert vectors_1 [ 4 ] . tolist ( ) == vectors_2 [ 5 ] . tolist ( )
assert vectors_1 [ 5 ] . tolist ( ) == vectors_2 [ 6 ] . tolist ( )
def test_issue3549 ( en_vocab ) :
""" Test that match pattern validation doesn ' t raise on empty errors. """
matcher = Matcher ( en_vocab , validate = True )
pattern = [ { " LOWER " : " hello " } , { " LOWER " : " world " } ]
matcher . add ( " GOOD " , [ pattern ] )
with pytest . raises ( MatchPatternError ) :
matcher . add ( " BAD " , [ [ { " X " : " Y " } ] ] )
2020-07-20 15:49:54 +03:00
@pytest.mark.skip ( " Matching currently only works on strings and integers " )
2020-07-06 15:05:59 +03:00
def test_issue3555 ( en_vocab ) :
""" Test that custom extensions with default None don ' t break matcher. """
Token . set_extension ( " issue3555 " , default = None )
matcher = Matcher ( en_vocab )
2020-07-20 15:49:54 +03:00
pattern = [ { " ORTH " : " have " } , { " _ " : { " issue3555 " : True } } ]
2020-07-06 15:05:59 +03:00
matcher . add ( " TEST " , [ pattern ] )
doc = Doc ( en_vocab , words = [ " have " , " apple " ] )
matcher ( doc )
def test_issue3611 ( ) :
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = [ " offensive " , " inoffensive " ]
x_train = [
" This is an offensive text " ,
" This is the second offensive text " ,
" inoff " ,
]
y_train = [ " offensive " , " offensive " , " inoffensive " ]
nlp = spacy . blank ( " en " )
# preparing the data
train_data = [ ]
for text , train_instance in zip ( x_train , y_train ) :
cat_dict = { label : label == train_instance for label in unique_classes }
train_data . append ( Example . from_dict ( nlp . make_doc ( text ) , { " cats " : cat_dict } ) )
# add a text categorizer component
2020-07-22 14:42:59 +03:00
model = {
" @architectures " : " spacy.TextCatBOW.v1 " ,
" exclusive_classes " : True ,
" ngram_size " : 2 ,
" no_output_layer " : False ,
}
textcat = nlp . add_pipe ( " textcat " , config = { " model " : model } , last = True )
2020-07-06 15:05:59 +03:00
for label in unique_classes :
textcat . add_label ( label )
# training the network
with nlp . select_pipes ( enable = " textcat " ) :
2020-07-22 14:42:59 +03:00
optimizer = nlp . begin_training ( )
2020-07-06 15:05:59 +03:00
for i in range ( 3 ) :
losses = { }
batches = minibatch ( train_data , size = compounding ( 4.0 , 32.0 , 1.001 ) )
for batch in batches :
nlp . update (
examples = batch , sgd = optimizer , drop = 0.1 , losses = losses ,
)
def test_issue3625 ( ) :
""" Test that default punctuation rules applies to hindi unicode characters """
nlp = Hindi ( )
doc = nlp ( " hi. how हुए. होटल, होटल " )
expected = [ " hi " , " . " , " how " , " हुए " , " . " , " होटल " , " , " , " होटल " ]
assert [ token . text for token in doc ] == expected
def test_issue3803 ( ) :
""" Test that spanish num-like tokens have True for like_num attribute. """
nlp = Spanish ( )
text = " 2 dos 1000 mil 12 doce "
doc = nlp ( text )
assert [ t . like_num for t in doc ] == [ True , True , True , True , True , True ]
@pytest.mark.filterwarnings ( " ignore::UserWarning " )
def test_issue3830_no_subtok ( ) :
""" Test that the parser doesn ' t have subtok label if not learn_tokens """
config = {
" learn_tokens " : False ,
" min_action_freq " : 30 ,
2020-07-22 14:42:59 +03:00
" update_with_oracle_cut_size " : 100 ,
2020-07-06 15:05:59 +03:00
}
2020-07-22 14:42:59 +03:00
model = registry . make_from_config ( { " model " : DEFAULT_PARSER_MODEL } , validate = True ) [
" model "
]
parser = DependencyParser ( Vocab ( ) , model , * * config )
2020-07-06 15:05:59 +03:00
parser . add_label ( " nsubj " )
assert " subtok " not in parser . labels
parser . begin_training ( lambda : [ ] )
assert " subtok " not in parser . labels
@pytest.mark.filterwarnings ( " ignore::UserWarning " )
def test_issue3830_with_subtok ( ) :
""" Test that the parser does have subtok label if learn_tokens=True. """
config = {
" learn_tokens " : True ,
" min_action_freq " : 30 ,
2020-07-22 14:42:59 +03:00
" update_with_oracle_cut_size " : 100 ,
2020-07-06 15:05:59 +03:00
}
2020-07-22 14:42:59 +03:00
model = registry . make_from_config ( { " model " : DEFAULT_PARSER_MODEL } , validate = True ) [
" model "
]
parser = DependencyParser ( Vocab ( ) , model , * * config )
2020-07-06 15:05:59 +03:00
parser . add_label ( " nsubj " )
assert " subtok " not in parser . labels
parser . begin_training ( lambda : [ ] )
assert " subtok " in parser . labels
def test_issue3839 ( en_vocab ) :
""" Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc ( en_vocab , words = [ " terrific " , " group " , " of " , " people " ] )
matcher = Matcher ( en_vocab )
match_id = " PATTERN "
pattern1 = [ { " LOWER " : " terrific " } , { " OP " : " ? " } , { " LOWER " : " group " } ]
pattern2 = [ { " LOWER " : " terrific " } , { " OP " : " ? " } , { " OP " : " ? " } , { " LOWER " : " group " } ]
matcher . add ( match_id , [ pattern1 ] )
matches = matcher ( doc )
assert matches [ 0 ] [ 0 ] == en_vocab . strings [ match_id ]
matcher = Matcher ( en_vocab )
matcher . add ( match_id , [ pattern2 ] )
matches = matcher ( doc )
assert matches [ 0 ] [ 0 ] == en_vocab . strings [ match_id ]
@pytest.mark.parametrize (
" sentence " ,
[
" The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction. " ,
" The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale ' s #1. " ,
" The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale ' s number one " ,
" Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions. " ,
" It was a missed assignment, but it shouldn ' t have resulted in a turnover ... " ,
] ,
)
def test_issue3869 ( sentence ) :
""" Test that the Doc ' s count_by function works consistently """
nlp = English ( )
doc = nlp ( sentence )
count = 0
for token in doc :
count + = token . is_alpha
assert count == doc . count_by ( IS_ALPHA ) . get ( 1 , 0 )
def test_issue3879 ( en_vocab ) :
doc = Doc ( en_vocab , words = [ " This " , " is " , " a " , " test " , " . " ] )
assert len ( doc ) == 5
pattern = [ { " ORTH " : " This " , " OP " : " ? " } , { " OP " : " ? " } , { " ORTH " : " test " } ]
matcher = Matcher ( en_vocab )
matcher . add ( " TEST " , [ pattern ] )
assert len ( matcher ( doc ) ) == 2 # fails because of a FP match 'is a test'
@pytest.mark.filterwarnings ( " ignore::UserWarning " )
def test_issue3880 ( ) :
""" Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7 .0 .5 of Thinc .
"""
texts = [ " hello " , " world " , " " , " " ]
nlp = English ( )
2020-07-22 14:42:59 +03:00
nlp . add_pipe ( " parser " ) . add_label ( " dep " )
nlp . add_pipe ( " ner " ) . add_label ( " PERSON " )
nlp . add_pipe ( " tagger " ) . add_label ( " NN " )
2020-07-06 15:05:59 +03:00
nlp . begin_training ( )
for doc in nlp . pipe ( texts ) :
pass
def test_issue3882 ( en_vocab ) :
""" Test that displaCy doesn ' t serialize the doc.user_data when making a
copy of the Doc .
"""
doc = Doc ( en_vocab , words = [ " Hello " , " world " ] )
doc . is_parsed = True
doc . user_data [ " test " ] = set ( )
parse_deps ( doc )
def test_issue3951 ( en_vocab ) :
""" Test that combinations of optional rules are matched correctly. """
matcher = Matcher ( en_vocab )
pattern = [
{ " LOWER " : " hello " } ,
{ " LOWER " : " this " , " OP " : " ? " } ,
{ " OP " : " ? " } ,
{ " LOWER " : " world " } ,
]
matcher . add ( " TEST " , [ pattern ] )
doc = Doc ( en_vocab , words = [ " Hello " , " my " , " new " , " world " ] )
matches = matcher ( doc )
assert len ( matches ) == 0
def test_issue3959 ( ) :
""" Ensure that a modified pos attribute is serialized correctly. """
nlp = English ( )
doc = nlp (
" displaCy uses JavaScript, SVG and CSS to show you how computers understand language "
)
assert doc [ 0 ] . pos_ == " "
doc [ 0 ] . pos_ = " NOUN "
assert doc [ 0 ] . pos_ == " NOUN "
# usually this is already True when starting from proper models instead of blank English
doc . is_tagged = True
with make_tempdir ( ) as tmp_dir :
file_path = tmp_dir / " my_doc "
doc . to_disk ( file_path )
doc2 = nlp ( " " )
doc2 . from_disk ( file_path )
assert doc2 [ 0 ] . pos_ == " NOUN "
def test_issue3962 ( en_vocab ) :
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise . """
# fmt: off
words = [ " He " , " jests " , " at " , " scars " , " , " , " that " , " never " , " felt " , " a " , " wound " , " . " ]
heads = [ 1 , 6 , - 1 , - 1 , 3 , 2 , 1 , 0 , 1 , - 2 , - 3 ]
deps = [ " nsubj " , " ccomp " , " prep " , " pobj " , " punct " , " nsubj " , " neg " , " ROOT " , " det " , " dobj " , " punct " ]
# fmt: on
doc = get_doc ( en_vocab , words = words , heads = heads , deps = deps )
span2 = doc [ 1 : 5 ] # "jests at scars ,"
doc2 = span2 . as_doc ( )
doc2_json = doc2 . to_json ( )
assert doc2_json
# head set to itself, being the new artificial root
assert doc2 [ 0 ] . head . text == " jests "
assert doc2 [ 0 ] . dep_ == " dep "
assert doc2 [ 1 ] . head . text == " jests "
assert doc2 [ 1 ] . dep_ == " prep "
assert doc2 [ 2 ] . head . text == " at "
assert doc2 [ 2 ] . dep_ == " pobj "
assert doc2 [ 3 ] . head . text == " jests " # head set to the new artificial root
assert doc2 [ 3 ] . dep_ == " dep "
# We should still have 1 sentence
assert len ( list ( doc2 . sents ) ) == 1
span3 = doc [ 6 : 9 ] # "never felt a"
doc3 = span3 . as_doc ( )
doc3_json = doc3 . to_json ( )
assert doc3_json
assert doc3 [ 0 ] . head . text == " felt "
assert doc3 [ 0 ] . dep_ == " neg "
assert doc3 [ 1 ] . head . text == " felt "
assert doc3 [ 1 ] . dep_ == " ROOT "
assert doc3 [ 2 ] . head . text == " felt " # head set to ancestor
assert doc3 [ 2 ] . dep_ == " dep "
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len ( list ( doc3 . sents ) ) == 1
def test_issue3962_long ( en_vocab ) :
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise . """
# fmt: off
words = [ " He " , " jests " , " at " , " scars " , " . " , " They " , " never " , " felt " , " a " , " wound " , " . " ]
heads = [ 1 , 0 , - 1 , - 1 , - 3 , 2 , 1 , 0 , 1 , - 2 , - 3 ]
deps = [ " nsubj " , " ROOT " , " prep " , " pobj " , " punct " , " nsubj " , " neg " , " ROOT " , " det " , " dobj " , " punct " ]
# fmt: on
two_sent_doc = get_doc ( en_vocab , words = words , heads = heads , deps = deps )
span2 = two_sent_doc [ 1 : 7 ] # "jests at scars. They never"
doc2 = span2 . as_doc ( )
doc2_json = doc2 . to_json ( )
assert doc2_json
# head set to itself, being the new artificial root (in sentence 1)
assert doc2 [ 0 ] . head . text == " jests "
assert doc2 [ 0 ] . dep_ == " ROOT "
assert doc2 [ 1 ] . head . text == " jests "
assert doc2 [ 1 ] . dep_ == " prep "
assert doc2 [ 2 ] . head . text == " at "
assert doc2 [ 2 ] . dep_ == " pobj "
assert doc2 [ 3 ] . head . text == " jests "
assert doc2 [ 3 ] . dep_ == " punct "
# head set to itself, being the new artificial root (in sentence 2)
assert doc2 [ 4 ] . head . text == " They "
assert doc2 [ 4 ] . dep_ == " dep "
# head set to the new artificial head (in sentence 2)
assert doc2 [ 4 ] . head . text == " They "
assert doc2 [ 4 ] . dep_ == " dep "
# We should still have 2 sentences
sents = list ( doc2 . sents )
assert len ( sents ) == 2
assert sents [ 0 ] . text == " jests at scars . "
assert sents [ 1 ] . text == " They never "
def test_issue3972 ( en_vocab ) :
""" Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher ( en_vocab )
matcher . add ( " A " , [ Doc ( en_vocab , words = [ " New " , " York " ] ) ] )
matcher . add ( " B " , [ Doc ( en_vocab , words = [ " New " , " York " ] ) ] )
doc = Doc ( en_vocab , words = [ " I " , " live " , " in " , " New " , " York " ] )
matches = matcher ( doc )
assert len ( matches ) == 2
# We should have a match for each of the two rules
found_ids = [ en_vocab . strings [ ent_id ] for ( ent_id , _ , _ ) in matches ]
assert " A " in found_ids
assert " B " in found_ids