2015-02-07 21:14:07 +03:00
import pytest
2017-01-11 20:05:36 +03:00
import numpy
2019-03-10 17:24:34 +03:00
from spacy . tokens import Doc , Span
2018-07-25 00:38:44 +03:00
from spacy . vocab import Vocab
2020-02-16 19:17:09 +03:00
from spacy . attrs import ENT_TYPE , ENT_IOB , SENT_START , HEAD , DEP
2018-07-25 00:38:44 +03:00
from . . util import get_doc
2017-01-11 20:05:36 +03:00
2015-02-07 21:14:07 +03:00
2018-11-27 03:09:36 +03:00
@pytest.mark.parametrize ( " text " , [ [ " one " , " two " , " three " ] ] )
2017-01-11 20:05:36 +03:00
def test_doc_api_compare_by_string_position ( en_vocab , text ) :
2018-07-25 00:38:44 +03:00
doc = Doc ( en_vocab , words = text )
2017-01-09 21:12:00 +03:00
# Get the tokens in this order, so their ID ordering doesn't match the idx
2017-01-11 20:05:36 +03:00
token3 = doc [ - 1 ]
token2 = doc [ - 2 ]
token1 = doc [ - 1 ]
token1 , token2 , token3 = doc
assert token1 < token2 < token3
assert not token1 > token2
assert token2 > token1
assert token2 < = token3
assert token3 > = token1
def test_doc_api_getitem ( en_tokenizer ) :
text = " Give it back! He pleaded. "
tokens = en_tokenizer ( text )
2018-11-27 03:09:36 +03:00
assert tokens [ 0 ] . text == " Give "
assert tokens [ - 1 ] . text == " . "
2015-02-07 21:14:07 +03:00
with pytest . raises ( IndexError ) :
tokens [ len ( tokens ) ]
2015-07-13 19:39:38 +03:00
2015-10-06 11:59:11 +03:00
def to_str ( span ) :
2018-11-27 03:09:36 +03:00
return " / " . join ( token . text for token in span )
2015-10-06 11:59:11 +03:00
2015-10-06 10:51:25 +03:00
span = tokens [ 1 : 1 ]
2015-10-06 11:59:11 +03:00
assert not to_str ( span )
2015-10-06 10:51:25 +03:00
span = tokens [ 1 : 4 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " it/back/! "
2015-10-06 10:56:33 +03:00
span = tokens [ 1 : 4 : 1 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " it/back/! "
2015-10-06 10:51:25 +03:00
with pytest . raises ( ValueError ) :
tokens [ 1 : 4 : 2 ]
with pytest . raises ( ValueError ) :
tokens [ 1 : 4 : - 1 ]
2015-10-06 11:59:11 +03:00
span = tokens [ - 3 : 6 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " He/pleaded "
2015-10-06 11:59:11 +03:00
span = tokens [ 4 : - 1 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " He/pleaded "
2015-10-06 11:59:11 +03:00
span = tokens [ - 5 : - 3 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " back/! "
2015-10-06 11:59:11 +03:00
span = tokens [ 5 : 4 ]
assert span . start == span . end == 5 and not to_str ( span )
span = tokens [ 4 : - 3 ]
assert span . start == span . end == 4 and not to_str ( span )
span = tokens [ : ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " Give/it/back/!/He/pleaded/. "
2015-10-06 11:59:11 +03:00
span = tokens [ 4 : ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " He/pleaded/. "
2015-10-06 11:59:11 +03:00
span = tokens [ : 4 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " Give/it/back/! "
2015-10-06 11:59:11 +03:00
span = tokens [ : - 3 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " Give/it/back/! "
2015-10-06 11:59:11 +03:00
span = tokens [ - 3 : ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " He/pleaded/. "
2015-10-06 11:59:11 +03:00
span = tokens [ 4 : 50 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " He/pleaded/. "
2015-10-06 11:59:11 +03:00
span = tokens [ - 50 : 4 ]
2018-11-27 03:09:36 +03:00
assert to_str ( span ) == " Give/it/back/! "
2015-10-06 11:59:11 +03:00
span = tokens [ - 50 : - 40 ]
assert span . start == span . end == 0 and not to_str ( span )
span = tokens [ 40 : 50 ]
assert span . start == span . end == 7 and not to_str ( span )
2015-10-06 12:08:39 +03:00
span = tokens [ 1 : 4 ]
2018-11-27 03:09:36 +03:00
assert span [ 0 ] . orth_ == " it "
2015-10-06 12:45:49 +03:00
subspan = span [ : ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " it/back/! "
2015-10-06 12:45:49 +03:00
subspan = span [ : 2 ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " it/back "
2015-10-06 12:45:49 +03:00
subspan = span [ 1 : ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " back/! "
2015-10-06 12:45:49 +03:00
subspan = span [ : - 1 ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " it/back "
2015-10-06 12:45:49 +03:00
subspan = span [ - 2 : ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " back/! "
2015-10-06 12:45:49 +03:00
subspan = span [ 1 : 2 ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " back "
2015-10-06 12:45:49 +03:00
subspan = span [ - 2 : - 1 ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " back "
2015-10-06 12:45:49 +03:00
subspan = span [ - 50 : 50 ]
2018-11-27 03:09:36 +03:00
assert to_str ( subspan ) == " it/back/! "
2015-10-06 12:45:49 +03:00
subspan = span [ 50 : - 50 ]
assert subspan . start == subspan . end == 4 and not to_str ( subspan )
2015-10-06 12:08:39 +03:00
2015-07-13 19:39:38 +03:00
2018-11-27 03:09:36 +03:00
@pytest.mark.parametrize (
" text " , [ " Give it back! He pleaded. " , " Give it back! He pleaded. " ]
)
2017-01-11 20:05:36 +03:00
def test_doc_api_serialize ( en_tokenizer , text ) :
tokens = en_tokenizer ( text )
2018-07-25 00:38:44 +03:00
new_tokens = Doc ( tokens . vocab ) . from_bytes ( tokens . to_bytes ( ) )
2017-05-31 00:34:23 +03:00
assert tokens . text == new_tokens . text
2017-01-11 20:05:36 +03:00
assert [ t . text for t in tokens ] == [ t . text for t in new_tokens ]
2015-07-13 19:39:38 +03:00
assert [ t . orth for t in tokens ] == [ t . orth for t in new_tokens ]
2015-08-06 01:35:40 +03:00
2018-07-25 00:38:44 +03:00
new_tokens = Doc ( tokens . vocab ) . from_bytes (
2019-03-10 21:16:45 +03:00
tokens . to_bytes ( exclude = [ " tensor " ] ) , exclude = [ " tensor " ]
2018-11-27 03:09:36 +03:00
)
2018-05-01 14:40:22 +03:00
assert tokens . text == new_tokens . text
assert [ t . text for t in tokens ] == [ t . text for t in new_tokens ]
assert [ t . orth for t in tokens ] == [ t . orth for t in new_tokens ]
2018-07-25 00:38:44 +03:00
new_tokens = Doc ( tokens . vocab ) . from_bytes (
2019-03-10 21:16:45 +03:00
tokens . to_bytes ( exclude = [ " sentiment " ] ) , exclude = [ " sentiment " ]
2018-11-27 03:09:36 +03:00
)
2018-05-01 14:40:22 +03:00
assert tokens . text == new_tokens . text
assert [ t . text for t in tokens ] == [ t . text for t in new_tokens ]
assert [ t . orth for t in tokens ] == [ t . orth for t in new_tokens ]
2015-08-06 01:35:40 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_set_ents ( en_tokenizer ) :
text = " I use goggle chrone to surf the web "
tokens = en_tokenizer ( text )
2015-08-06 01:35:40 +03:00
assert len ( tokens . ents ) == 0
2018-11-27 03:09:36 +03:00
tokens . ents = [ ( tokens . vocab . strings [ " PRODUCT " ] , 2 , 4 ) ]
2015-08-06 01:35:40 +03:00
assert len ( list ( tokens . ents ) ) == 1
2016-10-26 18:22:03 +03:00
assert [ t . ent_iob for t in tokens ] == [ 0 , 0 , 3 , 1 , 0 , 0 , 0 , 0 ]
2018-11-27 03:09:36 +03:00
assert tokens . ents [ 0 ] . label_ == " PRODUCT "
2017-01-11 20:05:36 +03:00
assert tokens . ents [ 0 ] . start == 2
assert tokens . ents [ 0 ] . end == 4
2015-10-18 09:17:27 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_sents_empty_string ( en_tokenizer ) :
doc = en_tokenizer ( " " )
2016-09-27 20:21:22 +03:00
doc . is_parsed = True
2016-09-27 19:49:14 +03:00
sents = list ( doc . sents )
assert len ( sents ) == 0
2017-01-11 20:05:36 +03:00
def test_doc_api_runtime_error ( en_tokenizer ) :
2016-01-25 17:22:42 +03:00
# Example that caused run-time error while parsing Reddit
2018-11-27 03:09:36 +03:00
# fmt: off
2017-01-11 20:05:36 +03:00
text = " 67 % o f black households are single parent \n \n 72 % o f all black babies born out of wedlock \n \n 50 % o f all black kids don \u2019 t finish high school "
2020-03-02 13:49:28 +03:00
deps = [ " nummod " , " nsubj " , " prep " , " amod " , " pobj " , " ROOT " , " amod " , " attr " , " " , " nummod " , " appos " , " prep " , " det " ,
" amod " , " pobj " , " acl " , " prep " , " prep " , " pobj " ,
" " , " nummod " , " nsubj " , " prep " , " det " , " amod " , " pobj " , " aux " , " neg " , " ccomp " , " amod " , " dobj " ]
2018-11-27 03:09:36 +03:00
# fmt: on
2017-01-11 20:05:36 +03:00
tokens = en_tokenizer ( text )
2018-07-25 00:38:44 +03:00
doc = get_doc ( tokens . vocab , words = [ t . text for t in tokens ] , deps = deps )
2016-01-25 17:22:42 +03:00
nps = [ ]
for np in doc . noun_chunks :
2018-11-27 03:09:36 +03:00
while len ( np ) > 1 and np [ 0 ] . dep_ not in ( " advmod " , " amod " , " compound " ) :
2016-01-25 17:22:42 +03:00
np = np [ 1 : ]
if len ( np ) > 1 :
2019-02-15 12:29:44 +03:00
nps . append ( np )
with doc . retokenize ( ) as retokenizer :
for np in nps :
attrs = {
" tag " : np . root . tag_ ,
" lemma " : np . text ,
" ent_type " : np . root . ent_type_ ,
}
retokenizer . merge ( np , attrs = attrs )
2016-02-07 01:47:51 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_right_edge ( en_tokenizer ) :
2017-01-14 15:41:19 +03:00
""" Test for bug occurring from Unshift action, causing incorrect right edge """
2018-11-27 03:09:36 +03:00
# fmt: off
2017-01-11 20:05:36 +03:00
text = " I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue. "
heads = [ 2 , 1 , 0 , - 1 , - 1 , - 3 , 15 , 1 , - 2 , - 1 , 1 , - 3 , - 1 , - 1 , 1 , - 2 , - 1 , 1 ,
- 2 , - 7 , 1 , - 19 , 1 , - 2 , - 3 , 2 , 1 , - 3 , - 26 ]
2018-11-27 03:09:36 +03:00
# fmt: on
2017-01-11 20:05:36 +03:00
tokens = en_tokenizer ( text )
2018-07-25 00:38:44 +03:00
doc = get_doc ( tokens . vocab , words = [ t . text for t in tokens ] , heads = heads )
2018-11-27 03:09:36 +03:00
assert doc [ 6 ] . text == " for "
2017-01-11 20:05:36 +03:00
subtree = [ w . text for w in doc [ 6 ] . subtree ]
2018-11-27 03:09:36 +03:00
assert subtree == [
" for " ,
" the " ,
" sake " ,
" of " ,
" such " ,
" as " ,
" live " ,
" under " ,
" the " ,
" government " ,
" of " ,
" the " ,
" Romans " ,
" , " ,
]
assert doc [ 6 ] . right_edge . text == " , "
2017-01-11 20:05:36 +03:00
2017-10-24 18:05:15 +03:00
def test_doc_api_has_vector ( ) :
vocab = Vocab ( )
2017-10-31 20:25:08 +03:00
vocab . reset_vectors ( width = 2 )
2018-11-27 03:09:36 +03:00
vocab . set_vector ( " kitten " , vector = numpy . asarray ( [ 0.0 , 2.0 ] , dtype = " f " ) )
doc = Doc ( vocab , words = [ " kitten " ] )
2016-05-09 13:36:14 +03:00
assert doc . has_vector
2016-10-16 21:20:23 +03:00
2018-01-15 18:29:48 +03:00
def test_doc_api_similarity_match ( ) :
2018-11-27 03:09:36 +03:00
doc = Doc ( Vocab ( ) , words = [ " a " ] )
2019-02-10 16:02:19 +03:00
assert doc . similarity ( doc [ 0 ] ) == 1.0
assert doc . similarity ( doc . vocab [ " a " ] ) == 1.0
2018-11-27 03:09:36 +03:00
doc2 = Doc ( doc . vocab , words = [ " a " , " b " , " c " ] )
2020-02-28 14:20:23 +03:00
with pytest . warns ( UserWarning ) :
2018-05-21 02:22:38 +03:00
assert doc . similarity ( doc2 [ : 1 ] ) == 1.0
assert doc . similarity ( doc2 ) == 0.0
2018-01-15 18:29:48 +03:00
2019-02-07 22:54:07 +03:00
@pytest.mark.parametrize (
" sentence,heads,lca_matrix " ,
[
(
" the lazy dog slept " ,
[ 2 , 1 , 1 , 0 ] ,
numpy . array ( [ [ 0 , 2 , 2 , 3 ] , [ 2 , 1 , 2 , 3 ] , [ 2 , 2 , 2 , 3 ] , [ 3 , 3 , 3 , 3 ] ] ) ,
) ,
(
" The lazy dog slept. The quick fox jumped " ,
[ 2 , 1 , 1 , 0 , - 1 , 2 , 1 , 1 , 0 ] ,
numpy . array (
[
[ 0 , 2 , 2 , 3 , 3 , - 1 , - 1 , - 1 , - 1 ] ,
[ 2 , 1 , 2 , 3 , 3 , - 1 , - 1 , - 1 , - 1 ] ,
[ 2 , 2 , 2 , 3 , 3 , - 1 , - 1 , - 1 , - 1 ] ,
[ 3 , 3 , 3 , 3 , 3 , - 1 , - 1 , - 1 , - 1 ] ,
[ 3 , 3 , 3 , 3 , 4 , - 1 , - 1 , - 1 , - 1 ] ,
[ - 1 , - 1 , - 1 , - 1 , - 1 , 5 , 7 , 7 , 8 ] ,
[ - 1 , - 1 , - 1 , - 1 , - 1 , 7 , 6 , 7 , 8 ] ,
[ - 1 , - 1 , - 1 , - 1 , - 1 , 7 , 7 , 7 , 8 ] ,
[ - 1 , - 1 , - 1 , - 1 , - 1 , 8 , 8 , 8 , 8 ] ,
]
) ,
) ,
] ,
)
2019-01-06 21:07:50 +03:00
def test_lowest_common_ancestor ( en_tokenizer , sentence , heads , lca_matrix ) :
tokens = en_tokenizer ( sentence )
doc = get_doc ( tokens . vocab , [ t . text for t in tokens ] , heads = heads )
2017-10-20 21:28:00 +03:00
lca = doc . get_lca_matrix ( )
2019-01-06 21:07:50 +03:00
assert ( lca == lca_matrix ) . all ( )
2018-11-27 03:09:36 +03:00
assert lca [ 1 , 1 ] == 1
assert lca [ 0 , 1 ] == 2
assert lca [ 1 , 2 ] == 2
2019-03-10 17:24:34 +03:00
def test_doc_is_nered ( en_vocab ) :
words = [ " I " , " live " , " in " , " New " , " York " ]
doc = Doc ( en_vocab , words = words )
assert not doc . is_nered
doc . ents = [ Span ( doc , 3 , 5 , label = " GPE " ) ]
assert doc . is_nered
# Test creating doc from array with unknown values
arr = numpy . array ( [ [ 0 , 0 ] , [ 0 , 0 ] , [ 0 , 0 ] , [ 384 , 3 ] , [ 384 , 1 ] ] , dtype = " uint64 " )
doc = Doc ( en_vocab , words = words ) . from_array ( [ ENT_TYPE , ENT_IOB ] , arr )
assert doc . is_nered
# Test serialization
new_doc = Doc ( en_vocab ) . from_bytes ( doc . to_bytes ( ) )
assert new_doc . is_nered
2019-03-11 16:21:40 +03:00
2020-02-16 19:17:09 +03:00
def test_doc_from_array_sent_starts ( en_vocab ) :
words = [ " I " , " live " , " in " , " New " , " York " , " . " , " I " , " like " , " cats " , " . " ]
heads = [ 0 , 0 , 0 , 0 , 0 , 0 , 6 , 6 , 6 , 6 ]
2020-03-02 13:49:28 +03:00
# fmt: off
2020-02-16 19:17:09 +03:00
deps = [ " ROOT " , " dep " , " dep " , " dep " , " dep " , " dep " , " ROOT " , " dep " , " dep " , " dep " , " dep " ]
2020-03-02 13:49:28 +03:00
# fmt: on
2020-02-16 19:17:09 +03:00
doc = Doc ( en_vocab , words = words )
for i , ( dep , head ) in enumerate ( zip ( deps , heads ) ) :
doc [ i ] . dep_ = dep
doc [ i ] . head = doc [ head ]
if head == i :
doc [ i ] . is_sent_start = True
doc . is_parsed
attrs = [ SENT_START , HEAD ]
arr = doc . to_array ( attrs )
new_doc = Doc ( en_vocab , words = words )
with pytest . raises ( ValueError ) :
new_doc . from_array ( attrs , arr )
attrs = [ SENT_START , DEP ]
arr = doc . to_array ( attrs )
new_doc = Doc ( en_vocab , words = words )
new_doc . from_array ( attrs , arr )
assert [ t . is_sent_start for t in doc ] == [ t . is_sent_start for t in new_doc ]
assert not new_doc . is_parsed
attrs = [ HEAD , DEP ]
arr = doc . to_array ( attrs )
new_doc = Doc ( en_vocab , words = words )
new_doc . from_array ( attrs , arr )
assert [ t . is_sent_start for t in doc ] == [ t . is_sent_start for t in new_doc ]
assert new_doc . is_parsed
2019-03-11 16:21:40 +03:00
def test_doc_lang ( en_vocab ) :
doc = Doc ( en_vocab , words = [ " Hello " , " world " ] )
assert doc . lang_ == " en "
assert doc . lang == en_vocab . strings [ " en " ]