2017-01-11 20:05:36 +03:00
# coding: utf-8
2015-02-07 21:14:07 +03:00
from __future__ import unicode_literals
2017-01-11 20:05:36 +03:00
from . . util import get_doc
2017-10-24 18:05:15 +03:00
from . . . tokens import Doc
from . . . vocab import Vocab
2018-05-20 16:15:37 +03:00
from . . . attrs import LEMMA
2015-07-13 19:39:38 +03:00
2015-02-07 21:14:07 +03:00
import pytest
2017-01-11 20:05:36 +03:00
import numpy
2015-02-07 21:14:07 +03:00
2017-01-11 20:05:36 +03:00
@pytest.mark.parametrize('text', [["one", "two", "three"]])
def test_doc_api_compare_by_string_position(en_vocab, text):
    """Token ordering comparisons must follow document position, not the
    order in which the Token objects were created."""
    doc = get_doc(en_vocab, text)
    # Fetch the tokens back-to-front so object-creation order deliberately
    # disagrees with document position. (The original fetched doc[-1] twice
    # and then clobbered all three names by unpacking the doc, which made
    # the out-of-order fetch a no-op.)
    token3 = doc[-1]
    token2 = doc[-2]
    token1 = doc[-3]
    assert token1 < token2 < token3
    assert not token1 > token2
    assert token2 > token1
    assert token2 <= token3
    assert token3 >= token1
def test_doc_api_getitem(en_tokenizer):
    """Exercise indexing and slicing on Doc and Span objects."""
    tokens = en_tokenizer("Give it back! He pleaded.")
    assert tokens[0].text == 'Give'
    assert tokens[-1].text == '.'
    # Indexing one past the end raises, like a plain sequence.
    with pytest.raises(IndexError):
        tokens[len(tokens)]

    def to_str(span):
        return '/'.join(token.text for token in span)

    # Basic and empty slices.
    span = tokens[1:1]
    assert not to_str(span)
    span = tokens[1:4]
    assert to_str(span) == 'it/back/!'
    # An explicit step of 1 is allowed; any other step is rejected.
    span = tokens[1:4:1]
    assert to_str(span) == 'it/back/!'
    with pytest.raises(ValueError):
        tokens[1:4:2]
    with pytest.raises(ValueError):
        tokens[1:4:-1]
    # Negative indices count from the end of the doc.
    span = tokens[-3:6]
    assert to_str(span) == 'He/pleaded'
    span = tokens[4:-1]
    assert to_str(span) == 'He/pleaded'
    span = tokens[-5:-3]
    assert to_str(span) == 'back/!'
    # Inverted ranges collapse to an empty span at the start offset.
    span = tokens[5:4]
    assert span.start == span.end == 5 and not to_str(span)
    span = tokens[4:-3]
    assert span.start == span.end == 4 and not to_str(span)
    # Open-ended slices.
    span = tokens[:]
    assert to_str(span) == 'Give/it/back/!/He/pleaded/.'
    span = tokens[4:]
    assert to_str(span) == 'He/pleaded/.'
    span = tokens[:4]
    assert to_str(span) == 'Give/it/back/!'
    span = tokens[:-3]
    assert to_str(span) == 'Give/it/back/!'
    span = tokens[-3:]
    assert to_str(span) == 'He/pleaded/.'
    # Out-of-range bounds clamp to the document limits.
    span = tokens[4:50]
    assert to_str(span) == 'He/pleaded/.'
    span = tokens[-50:4]
    assert to_str(span) == 'Give/it/back/!'
    span = tokens[-50:-40]
    assert span.start == span.end == 0 and not to_str(span)
    span = tokens[40:50]
    assert span.start == span.end == 7 and not to_str(span)

    # Slicing a Span yields a sub-span under the same rules.
    span = tokens[1:4]
    assert span[0].orth_ == 'it'
    subspan = span[:]
    assert to_str(subspan) == 'it/back/!'
    subspan = span[:2]
    assert to_str(subspan) == 'it/back'
    subspan = span[1:]
    assert to_str(subspan) == 'back/!'
    subspan = span[:-1]
    assert to_str(subspan) == 'it/back'
    subspan = span[-2:]
    assert to_str(subspan) == 'back/!'
    subspan = span[1:2]
    assert to_str(subspan) == 'back'
    subspan = span[-2:-1]
    assert to_str(subspan) == 'back'
    subspan = span[-50:50]
    assert to_str(subspan) == 'it/back/!'
    subspan = span[50:-50]
    assert subspan.start == subspan.end == 4 and not to_str(subspan)
2015-10-06 12:08:39 +03:00
2015-07-13 19:39:38 +03:00
2017-01-11 20:05:36 +03:00
# NOTE(review): the two parametrize cases were byte-identical; the second is
# presumed to be the trailing-whitespace variant — confirm against history.
@pytest.mark.parametrize('text', ["Give it back! He pleaded.",
                                  "Give it back! He pleaded. "])
def test_doc_api_serialize(en_tokenizer, text):
    """Round-trip a Doc through to_bytes()/from_bytes(), with and without
    the optional tensor and sentiment payloads."""
    tokens = en_tokenizer(text)

    def assert_same_tokens(a, b):
        # Compare surface text, per-token text and per-token orth IDs.
        assert a.text == b.text
        assert [t.text for t in a] == [t.text for t in b]
        assert [t.orth for t in a] == [t.orth for t in b]

    new_tokens = get_doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert_same_tokens(tokens, new_tokens)

    new_tokens = get_doc(tokens.vocab).from_bytes(
        tokens.to_bytes(tensor=False), tensor=False)
    assert_same_tokens(tokens, new_tokens)

    new_tokens = get_doc(tokens.vocab).from_bytes(
        tokens.to_bytes(sentiment=False), sentiment=False)
    assert_same_tokens(tokens, new_tokens)
2015-08-06 01:35:40 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_set_ents(en_tokenizer):
    """Assigning to doc.ents sets spans, labels and per-token IOB codes."""
    tokens = en_tokenizer("I use goggle chrone to surf the web")
    assert len(tokens.ents) == 0
    # One PRODUCT entity covering tokens [2, 4).
    tokens.ents = [(tokens.vocab.strings['PRODUCT'], 2, 4)]
    assert len(list(tokens.ents)) == 1
    # IOB codes: 0 = outside annotation, 3 = B(egin), 1 = I(nside).
    assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
    entity = tokens.ents[0]
    assert entity.label_ == 'PRODUCT'
    assert entity.start == 2
    assert entity.end == 4
2015-10-18 09:17:27 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_merge(en_tokenizer):
    """Doc.merge collapses a character range into a single token."""
    text = "WKRO played songs by the beach boys all night"

    # Merge 'the beach boys' (tokens 4-6) into one token.
    doc = en_tokenizer(text)
    assert len(doc) == 9
    doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
              ent_type='TYPE')
    assert len(doc) == 7
    assert doc[4].text == 'the beach boys'
    # text_with_ws keeps the trailing space of the merged span.
    assert doc[4].text_with_ws == 'the beach boys '
    assert doc[4].tag_ == 'NAMED'

    # Merge 'all night' (tokens 7-8) on a fresh doc.
    doc = en_tokenizer(text)
    assert len(doc) == 9
    doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), tag='NAMED', lemma='LEMMA',
              ent_type='TYPE')
    assert len(doc) == 8
    assert doc[7].text == 'all night'
    # Doc-final token: no trailing whitespace to preserve.
    assert doc[7].text_with_ws == 'all night'
2017-01-11 20:05:36 +03:00
def test_doc_api_merge_children(en_tokenizer):
    """Test that attachments work correctly after merging."""
    doc = en_tokenizer("WKRO played songs by the beach boys all night")
    assert len(doc) == 9
    doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
              ent_type='TYPE')
    # Every token must still appear among its head's left or right children.
    for word in doc:
        if word.i < word.head.i:
            assert word in list(word.head.lefts)
        elif word.i > word.head.i:
            assert word in list(word.head.rights)
2016-01-16 20:00:26 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_merge_hang(en_tokenizer):
    """Regression test: these two overlapping merges used to hang."""
    doc = en_tokenizer("through North and South Carolina")
    # Character offsets: 18-32 = 'South Carolina', 8-32 = 'North and South Carolina'.
    doc.merge(18, 32, tag='', lemma='', ent_type='ORG')
    doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
2016-01-25 17:22:42 +03:00
2018-05-20 16:15:37 +03:00
def test_doc_api_retokenizer(en_tokenizer):
    """The retokenize() context manager merges a span on exit."""
    doc = en_tokenizer("WKRO played songs by the beach boys all night")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7])
    assert len(doc) == 7
    assert doc[4].text == 'the beach boys'
def test_doc_api_retokenizer_attrs(en_tokenizer):
    """Merged tokens pick up the attrs passed to retokenizer.merge()."""
    doc = en_tokenizer("WKRO played songs by the beach boys all night")
    # Mix an integer attribute ID (LEMMA) with a string name ('ENT_TYPE'),
    # and a string value with an integer string-store ID.
    attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7], attrs=attrs)
    assert len(doc) == 7
    assert doc[4].text == 'the beach boys'
    assert doc[4].lemma_ == 'boys'
    assert doc[4].ent_type_ == 'ORG'
2017-01-11 20:05:36 +03:00
def test_doc_api_sents_empty_string(en_tokenizer):
    """An empty doc yields no sentences (and must not crash)."""
    doc = en_tokenizer("")
    # Mark as parsed so iterating .sents is permitted.
    doc.is_parsed = True
    assert len(list(doc.sents)) == 0
2017-01-11 20:05:36 +03:00
def test_doc_api_runtime_error(en_tokenizer):
    # Example that caused a run-time error while parsing Reddit.
    text = "67%of black households are single parent\n\n72%of all black babies born out of wedlock\n\n50%of all black kids don\u2019t finish high school"
    deps = ['nsubj', 'prep', 'amod', 'pobj', 'ROOT', 'amod', 'attr', '',
            'nummod', 'prep', 'det', 'amod', 'pobj', 'acl', 'prep', 'prep',
            'pobj', '', 'nummod', 'prep', 'det', 'amod', 'pobj', 'aux', 'neg',
            'ROOT', 'amod', 'dobj']
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], deps=deps)

    # Trim each noun chunk down to its modifier-led core, then merge them.
    nps = []
    for np in doc.noun_chunks:
        while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
            np = np[1:]
        if len(np) > 1:
            nps.append((np.start_char, np.end_char, np.root.tag_, np.text,
                        np.root.ent_type_))
    for start, end, tag, lemma, ent_type in nps:
        doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
2016-02-07 01:47:51 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_right_edge(en_tokenizer):
    """Test for bug occurring from Unshift action, causing incorrect right edge."""
    text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
             -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    assert doc[6].text == 'for'
    # The subtree of 'for' should run through the trailing comma ...
    subtree = [w.text for w in doc[6].subtree]
    assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
                       'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
    # ... and the right edge is therefore that comma.
    assert doc[6].right_edge.text == ','
2017-10-24 18:05:15 +03:00
def test_doc_api_has_vector():
    """A doc reports has_vector once its vocab holds a vector for a word."""
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
    doc = Doc(vocab, words=['kitten'])
    assert doc.has_vector
2016-10-16 21:20:23 +03:00
2018-01-15 18:29:48 +03:00
def test_doc_api_similarity_match():
    """similarity() is 1.0 for identical content, 0.0 otherwise; the calls
    may emit a models warning, which we tolerate via pytest.warns(None)."""
    doc = Doc(Vocab(), words=['a'])
    with pytest.warns(None):
        # A doc compared to its own token / lexeme matches exactly.
        assert doc.similarity(doc[0]) == 1.0
        assert doc.similarity(doc.vocab['a']) == 1.0
    doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
    with pytest.warns(None):
        assert doc.similarity(doc2[:1]) == 1.0
        assert doc.similarity(doc2) == 0.0
2018-01-15 18:29:48 +03:00
2017-10-20 21:28:00 +03:00
def test_lowest_common_ancestor(en_tokenizer):
    """get_lca_matrix() returns token indices of lowest common ancestors."""
    tokens = en_tokenizer('the lazy dog slept')
    # heads are relative offsets: 'the'->'dog', 'lazy'->'dog', 'dog'->'slept'.
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
    lca = doc.get_lca_matrix()
    assert lca[1, 1] == 1
    assert lca[0, 1] == 2
    assert lca[1, 2] == 2
2016-10-16 21:20:23 +03:00
2018-01-15 18:29:48 +03:00
2017-05-13 13:32:45 +03:00
def test_parse_tree(en_tokenizer):
    """Tests doc.print_tree() method."""
    text = 'I like New York in Autumn.'
    heads = [1, 0, 1, -2, -3, -1, -5]
    tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads,
                  tags=tags)
    # full method parse_tree(text) is a trivial composition
    trees = doc.print_tree()
    assert len(trees) > 0
    tree = trees[0]
    expected_keys = ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc',
                     'modifiers']
    assert all(key in list(tree.keys()) for key in expected_keys)
    assert tree['word'] == 'like'  # check root is correct