2021-02-28 04:32:48 +03:00
import weakref
2017-01-11 20:05:36 +03:00
import numpy
2022-04-25 19:19:03 +03:00
from numpy . testing import assert_array_equal
2021-12-04 22:34:48 +03:00
import pytest
2022-08-22 13:04:30 +03:00
import warnings
2021-12-04 22:34:48 +03:00
from thinc . api import NumpyOps , get_current_ops
2021-01-17 14:56:05 +03:00
2021-12-04 22:34:48 +03:00
from spacy . attrs import DEP , ENT_IOB , ENT_TYPE , HEAD , IS_ALPHA , MORPH , POS
from spacy . attrs import SENT_START , TAG
from spacy . lang . en import English
2023-01-31 19:30:43 +03:00
from spacy . lang . mul import MultiLanguage
2021-12-04 22:34:48 +03:00
from spacy . language import Language
from spacy . lexeme import Lexeme
2022-05-12 11:06:25 +03:00
from spacy . tokens import Doc , Span , SpanGroup , Token
2018-07-25 00:38:44 +03:00
from spacy . vocab import Vocab
2021-03-30 10:49:12 +03:00
from . test_underscore import clean_underscore # noqa: F401
2015-02-07 21:14:07 +03:00
2020-09-21 18:59:09 +03:00
def test_doc_api_init(en_vocab):
    """Check the sentence starts reported by ``Doc`` for several constructor inputs.

    Sentence boundaries are supplied through ``sent_starts``, through
    ``heads``, and through both at once; every variant is expected to produce
    the same ``is_sent_start`` flags.
    """
    tokens = [" a ", " b ", " c ", " d "]
    head_indices = [0, 0, 2, 2]
    expected_flags = [True, False, True, False]

    def sent_start_flags(doc):
        # Per-token sentence-start flag, in document order.
        return [token.is_sent_start for token in doc]

    # Sentence starts passed explicitly.
    by_flags = Doc(en_vocab, words=tokens, sent_starts=[True, False, True, False])
    assert sent_start_flags(by_flags) == expected_flags

    # Sentence starts derived from the heads.
    by_heads = Doc(en_vocab, words=tokens, heads=head_indices, deps=[" dep "] * 4)
    assert sent_start_flags(by_heads) == expected_flags

    # Both given: the heads are expected to win over the all-True sent_starts.
    by_both = Doc(
        en_vocab,
        words=tokens,
        sent_starts=[True] * 4,
        heads=head_indices,
        deps=[" dep "] * 4,
    )
    assert sent_start_flags(by_both) == expected_flags
2017-01-11 20:05:36 +03:00
2015-02-07 21:14:07 +03:00
2021-12-04 22:34:48 +03:00
@pytest.mark.issue(1547)
def test_issue1547():
    """Entities must still be present after merging tokens that overlap one."""
    tokens = [
        " \n ",
        " worda ",
        " . ",
        " \n ",
        " wordb ",
        " - ",
        " Biosphere ",
        " 2 ",
        " - ",
        " \n ",
    ]
    doc = Doc(Vocab(), words=tokens)
    product_span = Span(doc, 6, 8, label=doc.vocab.strings[" PRODUCT "])
    doc.ents = [product_span]
    # Merge a token range that partially overlaps the entity span.
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    entity_texts = [ent.text for ent in doc.ents]
    assert entity_texts
@pytest.mark.issue(1757)
def test_issue1757():
    """Comparing a token, span or lexeme against ``None`` must not segfault.

    Ordering comparisons with ``None`` are evaluated for each object type;
    the identity checks use ``is not None`` (the idiomatic spelling of the
    former ``not ... is None``).
    """
    doc = Doc(Vocab(), words=[" a ", " b ", " c "])
    # Token
    assert not doc[0] < None
    assert doc[0] is not None
    assert doc[0] >= None
    # Span
    assert not doc[:2] < None
    assert doc[:2] is not None
    assert doc[:2] >= None
    # Lexeme
    assert doc.vocab[" a "] is not None
    assert not doc.vocab[" a "] < None
@pytest.mark.issue(2396)
def test_issue2396(en_vocab):
    """``get_lca_matrix`` must give the expected matrix for a Doc and for a
    span covering the whole Doc."""
    tokens = [" She ", " created ", " a ", " test ", " for ", " spacy "]
    head_indices = [1, 1, 3, 1, 3, 4]
    expected = numpy.array(
        [
            [0, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1],
            [1, 1, 2, 3, 3, 3],
            [1, 1, 3, 3, 3, 3],
            [1, 1, 3, 3, 4, 4],
            [1, 1, 3, 3, 4, 5],
        ],
        dtype=numpy.int32,
    )
    doc = Doc(
        en_vocab,
        words=tokens,
        heads=head_indices,
        deps=[" dep "] * len(head_indices),
    )
    full_span = doc[:]
    # Same expectation for the Doc itself and for the full-length span.
    for container in (doc, full_span):
        assert (container.get_lca_matrix() == expected).all()
2022-09-26 16:58:21 +03:00
@pytest.mark.issue(11499)
def test_init_args_unmodified(en_vocab):
    """The ``ents`` and ``sent_starts`` lists passed to ``Doc`` must be left
    unchanged by the constructor."""
    init_kwargs = {
        "words": [" A ", " sentence "],
        "ents": [" B-TYPE1 ", " "],
        "sent_starts": [True, False],
    }
    Doc(vocab=en_vocab, **init_kwargs)
    # The caller's lists must still hold their original values.
    assert init_kwargs["ents"] == [" B-TYPE1 ", " "]
    assert init_kwargs["sent_starts"] == [True, False]
2021-12-04 22:34:48 +03:00
@pytest.mark.parametrize(" text ", [" -0.23 ", " +123,456 ", " ±1 "])
@pytest.mark.parametrize(" lang_cls ", [English, MultiLanguage])
@pytest.mark.issue(2782)
def test_issue2782(text, lang_cls):
    """A number with a leading sign must be one token flagged ``like_num``."""
    pipeline = lang_cls()
    analysed = pipeline(text)
    assert len(analysed) == 1
    assert analysed[0].like_num
@pytest.mark.parametrize(
    " sentence ",
    [
        " The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction. ",
        " The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale ' s #1. ",
        " The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale ' s number one ",
        " Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions. ",
        " It was a missed assignment, but it shouldn ' t have resulted in a turnover ... ",
    ],
)
@pytest.mark.issue(3869)
def test_issue3869(sentence):
    """``Doc.count_by(IS_ALPHA)`` must agree with counting ``is_alpha`` tokens."""
    doc = English()(sentence)
    # Reference value: number of tokens whose is_alpha flag is set.
    alpha_total = sum(token.is_alpha for token in doc)
    assert alpha_total == doc.count_by(IS_ALPHA).get(1, 0)
@pytest.mark.issue(3962)
def test_issue3962(en_vocab):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    Spans are cut so that some heads lie outside of them. The docs created
    from those spans must serialize with ``to_json``, carry the expected
    head/dep values and remain a single sentence.
    """
    # fmt: off
    words = [" He ", " jests ", " at ", " scars ", " , ", " that ", " never ", " felt ", " a ", " wound ", " . "]
    heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
    deps = [" nsubj ", " ccomp ", " prep ", " pobj ", " punct ", " nsubj ", " neg ", " ROOT ", " det ", " dobj ", " punct "]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)

    def check_annotations(sub_doc, expected):
        # Compare (head text, dep label) of each token, in token order.
        for index, (head_text, dep_label) in enumerate(expected):
            assert sub_doc[index].head.text == head_text
            assert sub_doc[index].dep_ == dep_label

    jests_doc = doc[1:5].as_doc()  # "jests at scars ,"
    assert jests_doc.to_json()
    check_annotations(
        jests_doc,
        [
            (" jests ", " dep "),  # head set to itself, being the new artificial root
            (" jests ", " prep "),
            (" at ", " pobj "),
            (" jests ", " dep "),  # head set to the new artificial root
        ],
    )
    # We should still have 1 sentence
    assert len(list(jests_doc.sents)) == 1

    felt_doc = doc[6:9].as_doc()  # "never felt a"
    assert felt_doc.to_json()
    check_annotations(
        felt_doc,
        [
            (" felt ", " neg "),
            (" felt ", " ROOT "),
            (" felt ", " dep "),  # head set to ancestor
        ],
    )
    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(felt_doc.sents)) == 1
@pytest.mark.issue(3962)
def test_issue3962_long(en_vocab):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    Same idea as the single-sentence variant, but the span crosses a
    sentence boundary, so the resulting doc must keep two sentences.
    """
    # fmt: off
    words = [" He ", " jests ", " at ", " scars ", " . ", " They ", " never ", " felt ", " a ", " wound ", " . "]
    heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
    deps = [" nsubj ", " ROOT ", " prep ", " pobj ", " punct ", " nsubj ", " neg ", " ROOT ", " det ", " dobj ", " punct "]
    # fmt: on
    two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    sub_doc = two_sent_doc[1:7].as_doc()  # "jests at scars. They never"
    assert sub_doc.to_json()

    # (token index, expected head text, expected dep label)
    # NOTE(review): the last two rows both check token 4, as in the original
    # assertions; token 5 may have been intended for the final row - confirm.
    expected = [
        (0, " jests ", " ROOT "),  # head set to itself, being the new artificial root (in sentence 1)
        (1, " jests ", " prep "),
        (2, " at ", " pobj "),
        (3, " jests ", " punct "),
        (4, " They ", " dep "),  # head set to itself, being the new artificial root (in sentence 2)
        (4, " They ", " dep "),  # head set to the new artificial head (in sentence 2)
    ]
    for index, head_text, dep_label in expected:
        assert sub_doc[index].head.text == head_text
        assert sub_doc[index].dep_ == dep_label

    # We should still have 2 sentences
    sentences = list(sub_doc.sents)
    assert len(sentences) == 2
    assert sentences[0].text == " jests at scars . "
    assert sentences[1].text == " They never "
@Language.factory(" my_pipe ")
class CustomPipe:
    """Pipeline component that records each sentence's end index as text.

    The value is written to a custom extension on every sentence span, and
    the joined values of all sentences are written to the same-named
    extension on the doc.
    """

    def __init__(self, nlp, name=" my_pipe "):
        self.name = name
        # Register the custom extension on spans (getter) and docs (default).
        Span.set_extension(" my_ext ", getter=self._get_my_ext)
        Doc.set_extension(" my_ext ", default=None)

    def __call__(self, doc):
        per_sentence = []
        for sentence in doc.sents:
            value = self._get_my_ext(sentence)
            sentence._.set(" my_ext ", value)
            per_sentence.append(value)
        joined = " \n ".join(per_sentence)
        doc._.set(" my_ext ", joined)
        return doc

    @staticmethod
    def _get_my_ext(span):
        # String form of the span's end attribute.
        end_index = span.end
        return str(end_index)
@pytest.mark.issue(4903)
def test_issue4903():
    """Ensure that multiprocessing with a custom component runs correctly and
    doesn't hang or crash on Windows / macOS."""
    nlp = English()
    nlp.add_pipe(" sentencizer ")
    nlp.add_pipe(" my_pipe ", after=" sentencizer ")
    texts = [" I like bananas. ", " Do you like them? ", " No, I prefer wasabi. "]
    # The multiprocessing run is only exercised when the current ops are NumpyOps.
    if not isinstance(get_current_ops(), NumpyOps):
        return
    docs = list(nlp.pipe(texts, n_process=2))
    for position, expected_text in enumerate(texts):
        assert docs[position].text == expected_text
@pytest.mark.issue(5048)
def test_issue5048(en_vocab):
    """POS/TAG values loaded with ``from_array`` must match those passed
    directly to the ``Doc`` constructor."""
    words = [" This ", " is ", " a ", " sentence "]
    pos_s = [" DET ", " VERB ", " DET ", " NOUN "]
    spaces = [" ", " ", " ", " "]
    deps_s = [" dep ", " adj ", " nn ", " atm "]
    tags_s = [" DT ", " VBZ ", " DT ", " NN "]

    def summary(doc):
        # The per-token values compared between the two docs.
        return [(token.text, token.pos_, token.tag_) for token in doc]

    string_store = en_vocab.strings
    for word in words:
        string_store.add(word)
    # Keep the values returned by ``add``; they become the array cells.
    dep_ids = [string_store.add(label) for label in deps_s]
    pos_ids = [string_store.add(label) for label in pos_s]
    tag_ids = [string_store.add(label) for label in tags_s]
    columns = [POS, DEP, TAG]
    rows = numpy.array(list(zip(pos_ids, dep_ids, tag_ids)), dtype=" uint64 ")

    array_doc = Doc(en_vocab, words=words, spaces=spaces)
    array_doc.from_array(columns, rows)
    from_array = summary(array_doc)

    init_doc = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
    from_init = summary(init_doc)
    assert from_array == from_init
2018-11-27 03:09:36 +03:00
@pytest.mark.parametrize(" text ", [[" one ", " two ", " three "]])
def test_doc_api_compare_by_string_position(en_vocab, text):
    """Tokens of one doc must order by their position in the doc.

    The tokens are first fetched back to front and then rebound in document
    order before the rich comparisons are checked.
    """
    doc = Doc(en_vocab, words=text)
    # Get the tokens in this order, so their ID ordering doesn't match the idx.
    # token1 used to re-fetch doc[-1] (copy-paste slip); the first token of the
    # three-word doc is doc[-3].
    token3 = doc[-1]
    token2 = doc[-2]
    token1 = doc[-3]
    # Rebind the names in document order for the comparisons below.
    token1, token2, token3 = doc
    assert token1 < token2 < token3
    assert not token1 > token2
    assert token2 > token1
    assert token2 <= token3
    assert token3 >= token1
def test_doc_api_getitem(en_tokenizer):
    """Exercise integer indexing and slicing on a ``Doc`` and on a ``Span``."""
    tokens = en_tokenizer(" Give it back! He pleaded. ")

    def joined(span):
        # Token texts of the span joined into one string for comparison.
        return " / ".join(token.text for token in span)

    # Integer indexing, including negative and out-of-range indices.
    assert tokens[0].text == " Give "
    assert tokens[-1].text == " . "
    with pytest.raises(IndexError):
        tokens[len(tokens)]

    # Slices with explicit bounds and a step.
    assert not joined(tokens[1:1])
    assert joined(tokens[1:4]) == " it/back/! "
    assert joined(tokens[1:4:1]) == " it/back/! "
    with pytest.raises(ValueError):
        tokens[1:4:2]
    with pytest.raises(ValueError):
        tokens[1:4:-1]

    # Negative bounds.
    assert joined(tokens[-3:6]) == " He/pleaded "
    assert joined(tokens[4:-1]) == " He/pleaded "
    assert joined(tokens[-5:-3]) == " back/! "

    # Bounds that describe an empty range.
    empty = tokens[5:4]
    assert empty.start == empty.end == 5
    assert not joined(empty)
    empty = tokens[4:-3]
    assert empty.start == empty.end == 4
    assert not joined(empty)

    # Open-ended slices.
    assert joined(tokens[:]) == " Give/it/back/!/He/pleaded/. "
    assert joined(tokens[4:]) == " He/pleaded/. "
    assert joined(tokens[:4]) == " Give/it/back/! "
    assert joined(tokens[:-3]) == " Give/it/back/! "
    assert joined(tokens[-3:]) == " He/pleaded/. "

    # Bounds far outside the doc.
    assert joined(tokens[4:50]) == " He/pleaded/. "
    assert joined(tokens[-50:4]) == " Give/it/back/! "
    empty = tokens[-50:-40]
    assert empty.start == empty.end == 0
    assert not joined(empty)
    empty = tokens[40:50]
    assert empty.start == empty.end == 7
    assert not joined(empty)

    # Indexing and slicing a span.
    parent = tokens[1:4]
    assert parent[0].orth_ == " it "
    assert joined(parent[:]) == " it/back/! "
    assert joined(parent[:2]) == " it/back "
    assert joined(parent[1:]) == " back/! "
    assert joined(parent[:-1]) == " it/back "
    assert joined(parent[-2:]) == " back/! "
    assert joined(parent[1:2]) == " back "
    assert joined(parent[-2:-1]) == " back "
    assert joined(parent[-50:50]) == " it/back/! "
    empty_sub = parent[50:-50]
    assert empty_sub.start == empty_sub.end == 4
    assert not joined(empty_sub)
2015-10-06 12:08:39 +03:00
2015-07-13 19:39:38 +03:00
2018-11-27 03:09:36 +03:00
@pytest.mark.parametrize(
    " text ", [" Give it back! He pleaded. ", " Give it back! He pleaded. "]
)
def test_doc_api_serialize(en_tokenizer, text):
    """Round-trip a ``Doc`` through ``to_bytes``/``from_bytes`` and compare.

    Also checks that serializing warns once a user hook has been set.
    """
    tokens = en_tokenizer(text)
    tokens[0].lemma_ = " lemma "
    tokens[0].norm_ = " norm "
    tokens.ents = [(tokens.vocab.strings[" PRODUCT "], 0, 1)]
    tokens[0].ent_kb_id_ = " ent_kb_id "
    tokens[0].ent_id_ = " ent_id "

    def assert_same_tokens(restored):
        # Text, token texts and orth values must survive the round trip.
        assert tokens.text == restored.text
        assert [t.text for t in tokens] == [t.text for t in restored]
        assert [t.orth for t in tokens] == [t.orth for t in restored]

    # Plain round trip, including the token-level attributes set above.
    restored = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert_same_tokens(restored)
    assert restored[0].lemma_ == " lemma "
    assert restored[0].norm_ == " norm "
    assert restored[0].ent_kb_id_ == " ent_kb_id "
    assert restored[0].ent_id_ == " ent_id "

    # Round trip with the tensor excluded on both sides.
    payload = tokens.to_bytes(exclude=[" tensor "])
    restored = Doc(tokens.vocab).from_bytes(payload, exclude=[" tensor "])
    assert_same_tokens(restored)

    # Plain round trip once more.
    restored = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert_same_tokens(restored)

    def inner_func(d1, d2):
        return " hello! "

    _ = tokens.to_bytes()  # noqa: F841
    # With a user hook installed, to_bytes is expected to emit a UserWarning.
    with pytest.warns(UserWarning):
        tokens.user_hooks[" similarity "] = inner_func
        _ = tokens.to_bytes()  # noqa: F841
2020-12-29 13:54:32 +03:00
2015-08-06 01:35:40 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_set_ents(en_tokenizer):
    """Assigning ``doc.ents`` from a (label, start, end) tuple sets one entity."""
    doc = en_tokenizer(" I use goggle chrone to surf the web ")
    assert len(doc.ents) == 0
    product_label = doc.vocab.strings[" PRODUCT "]
    doc.ents = [(product_label, 2, 4)]
    assert len(list(doc.ents)) == 1
    iob_codes = [token.ent_iob for token in doc]
    assert iob_codes == [2, 2, 3, 1, 2, 2, 2, 2]
    # Label and boundaries of the entity that was set.
    assert doc.ents[0].label_ == " PRODUCT "
    assert doc.ents[0].start == 2
    assert doc.ents[0].end == 4
2015-10-18 09:17:27 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_sents_empty_string(en_tokenizer):
    """The tokenized input is expected to yield no sentences."""
    tokenized = en_tokenizer(" ")
    sentences = list(tokenized.sents)
    assert sentences == []
2017-01-11 20:05:36 +03:00
def test_doc_api_runtime_error(en_tokenizer):
    """Merge trimmed noun chunks on an input that once raised a run-time error.

    The test has no assertions; it passes when the merge completes.
    """
    # Example that caused run-time error while parsing Reddit
    # fmt: off
    text = " 67 % o f black households are single parent \n \n 72 % o f all black babies born out of wedlock \n \n 50 % o f all black kids don \u2019 t finish high school "
    deps = [" nummod ", " nsubj ", " prep ", " amod ", " pobj ", " ROOT ", " amod ", " attr ", " ", " nummod ", " appos ", " prep ", " det ",
            " amod ", " pobj ", " acl ", " prep ", " prep ", " pobj ",
            " ", " nummod ", " nsubj ", " prep ", " det ", " amod ", " pobj ", " aux ", " neg ", " ccomp ", " amod ", " dobj "]
    # fmt: on
    tokenized = en_tokenizer(text)
    doc = Doc(tokenized.vocab, words=[t.text for t in tokenized], deps=deps)

    # Strip leading tokens from each chunk until a modifier-like dep leads it.
    to_merge = []
    for chunk in doc.noun_chunks:
        while len(chunk) > 1 and chunk[0].dep_ not in (" advmod ", " amod ", " compound "):
            chunk = chunk[1:]
        if len(chunk) > 1:
            to_merge.append(chunk)

    with doc.retokenize() as retokenizer:
        for chunk in to_merge:
            retokenizer.merge(
                chunk,
                attrs={
                    " tag ": chunk.root.tag_,
                    " lemma ": chunk.text,
                    " ent_type ": chunk.root.ent_type_,
                },
            )
2016-02-07 01:47:51 +03:00
2020-09-21 21:43:54 +03:00
def test_doc_api_right_edge(en_vocab):
    """Test for bug occurring from Unshift action, causing incorrect right edge"""
    # fmt: off
    words = [
        " I ", " have ", " proposed ", " to ", " myself ", " , ", " for ", " the ", " sake ",
        " of ", " such ", " as ", " live ", " under ", " the ", " government ", " of ", " the ",
        " Romans ", " , ", " to ", " translate ", " those ", " books ", " into ", " the ",
        " Greek ", " tongue ", " . "
    ]
    heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2]
    expected_subtree = [" for ", " the ", " sake ", " of ", " such ", " as ", " live ", " under ", " the ", " government ", " of ", " the ", " Romans ", " , "]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=[" dep "] * len(heads))
    anchor = doc[6]
    assert anchor.text == " for "
    # The subtree of the anchor token, in document order.
    subtree_texts = [token.text for token in anchor.subtree]
    assert subtree_texts == expected_subtree
    assert anchor.right_edge.text == " , "
2017-01-11 20:05:36 +03:00
2017-10-24 18:05:15 +03:00
def test_doc_api_has_vector():
    """A doc whose only word has a vector in the vocab reports ``has_vector``."""
    vector_vocab = Vocab()
    vector_vocab.reset_vectors(width=2)
    kitten_vector = numpy.asarray([0.0, 2.0], dtype=" f ")
    vector_vocab.set_vector(" kitten ", vector=kitten_vector)
    kitten_doc = Doc(vector_vocab, words=[" kitten "])
    assert kitten_doc.has_vector
2016-10-16 21:20:23 +03:00
2018-01-15 18:29:48 +03:00
def test_doc_api_similarity_match():
    """``Doc.similarity`` against a token, lexeme, span and doc."""
    single = Doc(Vocab(), words=[" a "])
    # Same single word: token and lexeme both give 1.0.
    assert single.similarity(single[0]) == 1.0
    assert single.similarity(single.vocab[" a "]) == 1.0
    longer = Doc(single.vocab, words=[" a ", " b ", " c "])
    # Comparing against the second doc is expected to emit a UserWarning.
    with pytest.warns(UserWarning):
        assert single.similarity(longer[:1]) == 1.0
        assert single.similarity(longer) == 0.0
2018-01-15 18:29:48 +03:00
2019-02-07 22:54:07 +03:00
@pytest.mark.parametrize(
    " words,heads,lca_matrix ",
    [
        (
            [" the ", " lazy ", " dog ", " slept "],
            [2, 2, 3, 3],
            numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]),
        ),
        (
            [" The ", " lazy ", " dog ", " slept ", " . ", " The ", " quick ", " fox ", " jumped "],
            [2, 2, 3, 3, 3, 7, 7, 8, 8],
            numpy.array(
                [
                    [0, 2, 2, 3, 3, -1, -1, -1, -1],
                    [2, 1, 2, 3, 3, -1, -1, -1, -1],
                    [2, 2, 2, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 4, -1, -1, -1, -1],
                    [-1, -1, -1, -1, -1, 5, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 6, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 8, 8, 8, 8],
                ]
            ),
        ),
    ],
)
def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix):
    """``Doc.get_lca_matrix`` must equal the expected matrix for each case."""
    dep_labels = [" dep "] * len(heads)
    doc = Doc(en_vocab, words, heads=heads, deps=dep_labels)
    computed = doc.get_lca_matrix()
    assert (computed == lca_matrix).all()
    # Spot checks on individual cells shared by both parametrized cases.
    assert computed[1, 1] == 1
    assert computed[0, 1] == 2
    assert computed[1, 2] == 2
2019-03-10 17:24:34 +03:00
def test_doc_is_nered(en_vocab):
    """``has_annotation`` for entity IOB values after setting ents, after
    ``from_array`` and after a bytes round trip."""
    words = [" I ", " live ", " in ", " New ", " York "]
    iob_attr = " ENT_IOB "

    plain_doc = Doc(en_vocab, words=words)
    assert not plain_doc.has_annotation(iob_attr)
    plain_doc.ents = [Span(plain_doc, 3, 5, label=" GPE ")]
    assert plain_doc.has_annotation(iob_attr)

    # Test creating doc from array with unknown values
    ent_rows = [[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]]
    ent_array = numpy.array(ent_rows, dtype=" uint64 ")
    array_doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], ent_array)
    assert array_doc.has_annotation(iob_attr)

    # Test serialization
    restored_doc = Doc(en_vocab).from_bytes(array_doc.to_bytes())
    assert restored_doc.has_annotation(iob_attr)
2019-03-11 16:21:40 +03:00
2020-02-16 19:17:09 +03:00
def test_doc_from_array_sent_starts(en_vocab):
    """``from_array`` with SENT_START and/or HEAD columns.

    Covers both columns together, the doc's default array attrs under an
    error-on-warning filter, SENT_START alone and HEAD with DEP.
    """
    # fmt: off
    words = [" I ", " live ", " in ", " New ", " York ", " . ", " I ", " like ", " cats ", " . "]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = [" ROOT ", " dep ", " dep ", " dep ", " dep ", " dep ", " ROOT ", " dep ", " dep ", " dep "]
    # fmt: on
    source = Doc(en_vocab, words=words, heads=heads, deps=deps)

    def sent_start_flags(doc):
        return [token.is_sent_start for token in doc]

    def rebuild(columns):
        # Export the columns from the source doc and load them into a new doc.
        exported = source.to_array(columns)
        fresh = Doc(en_vocab, words=words)
        fresh.from_array(columns, exported)
        return fresh

    # HEAD overrides SENT_START without warning
    target = rebuild([SENT_START, HEAD])

    # no warning using default attrs
    default_columns = source._get_array_attrs()
    default_export = source.to_array(default_columns)
    with warnings.catch_warnings():
        warnings.simplefilter(" error ")
        target.from_array(default_columns, default_export)

    # only SENT_START uses SENT_START
    target = rebuild([SENT_START])
    assert sent_start_flags(source) == sent_start_flags(target)
    assert not target.has_annotation(" DEP ")

    # only HEAD uses HEAD
    target = rebuild([HEAD, DEP])
    assert sent_start_flags(source) == sent_start_flags(target)
    assert target.has_annotation(" DEP ")
2020-02-16 19:17:09 +03:00
2020-07-14 15:07:35 +03:00
def test_doc_from_array_morph(en_vocab):
    """Morph values must survive ``to_array``/``from_array`` with MORPH."""
    # fmt: off
    words = [" I ", " live ", " in ", " New ", " York ", " . "]
    morphs = [" Feat1=A ", " Feat1=B ", " Feat1=C ", " Feat1=A|Feat2=D ", " Feat2=E ", " Feat3=F "]
    # fmt: on
    source = Doc(en_vocab, words=words, morphs=morphs)
    columns = [MORPH]
    exported = source.to_array(columns)
    target = Doc(en_vocab, words=words)
    target.from_array(columns, exported)

    def morph_strings(doc):
        return [str(token.morph) for token in doc]

    # The copy matches both the input strings and the source doc.
    assert morph_strings(target) == morphs
    assert morph_strings(source) == morph_strings(target)
2020-07-14 15:07:35 +03:00
2021-03-30 10:49:12 +03:00
@pytest.mark.usefixtures(" clean_underscore ")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    """Exercise ``Doc.from_docs``: merged text, token offsets, span groups,
    custom extensions, the ``exclude`` option and tensors."""
    en_texts = [
        " Merging the docs is fun. ",
        " ",
        " They don ' t think alike. ",
        " ",
        " Another doc. ",
    ]
    en_texts_without_empty = [t for t in en_texts if len(t)]
    de_text = " Wie war die Frage? "
    en_docs = [en_tokenizer(text) for text in en_texts]
    en_docs[0].spans[" group "] = [en_docs[0][1:4]]
    en_docs[2].spans[" group "] = [en_docs[2][1:4]]
    en_docs[4].spans[" group "] = [en_docs[4][0:1]]
    span_group_texts = sorted(
        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
    )
    de_doc = de_tokenizer(de_text)
    Token.set_extension(" is_ambiguous ", default=False)
    en_docs[0][2]._.is_ambiguous = True  # docs
    en_docs[2][3]._.is_ambiguous = True  # think

    def assert_sentence_count(merged):
        assert len(en_texts_without_empty) == len(list(merged.sents))

    def last_token_of_first_doc(merged):
        # Token of the merged doc at the position where the first doc ends.
        return merged[len(en_docs[0]) - 1]

    def assert_token_count(merged):
        all_tokens = [t for doc in en_docs for t in doc]
        assert len(merged) == len(all_tokens)

    def think_offset(gap):
        # Expected character offset of "think", given the gap between docs.
        return len(en_texts[0]) + gap + en_texts[2].index(" think ")

    def assert_group_spans(merged):
        assert " group " in merged.spans
        assert span_group_texts == sorted([s.text for s in merged.spans[" group "]])

    # Degenerate inputs and mismatching docs.
    assert Doc.from_docs([]) is None
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))
    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])

    # Default merge.
    m_doc = Doc.from_docs(en_docs)
    assert_sentence_count(m_doc)
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    p_token = last_token_of_first_doc(m_doc)
    assert p_token.text == " . " and bool(p_token.whitespace_)
    assert_token_count(m_doc)
    think_idx = think_offset(1)
    assert m_doc[2]._.is_ambiguous is True
    assert m_doc[9].idx == think_idx
    assert m_doc[9]._.is_ambiguous is True
    assert not any([t._.is_ambiguous for t in m_doc[3:8]])
    assert_group_spans(m_doc)
    assert bool(m_doc[11].whitespace_)

    # Merge without ensuring whitespace between the docs.
    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert_sentence_count(m_doc)
    assert len(m_doc.text) == sum(len(t) for t in en_texts)
    assert m_doc.text == " ".join(en_texts_without_empty)
    p_token = last_token_of_first_doc(m_doc)
    assert p_token.text == " . " and not bool(p_token.whitespace_)
    assert_token_count(m_doc)
    think_idx = think_offset(0)
    assert m_doc[9].idx == think_idx
    assert_group_spans(m_doc)
    assert bool(m_doc[11].whitespace_)

    # Merge with an explicit attribute list.
    m_doc = Doc.from_docs(en_docs, attrs=[" lemma ", " length ", " pos "])
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    # space delimiter considered, although spacy attribute was missing
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    p_token = last_token_of_first_doc(m_doc)
    assert p_token.text == " . " and bool(p_token.whitespace_)
    assert_token_count(m_doc)
    think_idx = think_offset(1)
    assert m_doc[9].idx == think_idx
    assert_group_spans(m_doc)

    # can exclude spans
    m_doc = Doc.from_docs(en_docs, exclude=[" spans "])
    assert " group " not in m_doc.spans

    # can exclude user_data
    m_doc = Doc.from_docs(en_docs, exclude=[" user_data "])
    assert m_doc.user_data == {}

    # can merge empty docs
    doc = Doc.from_docs([en_tokenizer(" ")] * 10)

    # empty but set spans keys are preserved
    en_docs = [en_tokenizer(text) for text in en_texts]
    m_doc = Doc.from_docs(en_docs)
    assert " group " not in m_doc.spans
    for doc in en_docs:
        doc.spans[" group "] = []
    m_doc = Doc.from_docs(en_docs)
    assert " group " in m_doc.spans
    assert len(m_doc.spans[" group "]) == 0

    # with tensor
    ops = get_current_ops()
    for doc in en_docs:
        doc.tensor = ops.asarray([[len(t.text), 0.0] for t in doc])
    m_doc = Doc.from_docs(en_docs)
    stacked = ops.xp.vstack([doc.tensor for doc in en_docs if len(doc)])
    assert_array_equal(ops.to_numpy(m_doc.tensor), ops.to_numpy(stacked))

    # can exclude tensor
    m_doc = Doc.from_docs(en_docs, exclude=[" tensor "])
    assert m_doc.tensor.shape == (0,)
2020-07-03 12:32:42 +03:00
2020-09-17 01:14:01 +03:00
# Merge two tokenized docs where only the second one carries an entity and
# check that the merged doc reports exactly one entity.
def test_doc_api_from_docs_ents ( en_tokenizer ) :
texts = [ " Merging the docs is fun. " , " They don ' t think alike. " ]
docs = [ en_tokenizer ( t ) for t in texts ]
# first doc: entities explicitly set to an empty tuple
docs [ 0 ] . ents = ( )
# second doc: a single one-token span (token 0) with an explicit label
docs [ 1 ] . ents = ( Span ( docs [ 1 ] , 0 , 1 , label = " foo " ) , )
doc = Doc . from_docs ( docs )
assert len ( doc . ents ) == 1
2019-03-11 16:21:40 +03:00
# Check the language attributes exposed on a Doc and on its tokens, both as
# a string (lang_) and as the matching entry looked up in en_vocab.strings
# (lang).
def test_doc_lang ( en_vocab ) :
# case 1: Doc constructed directly from the en_vocab fixture
doc = Doc ( en_vocab , words = [ " Hello " , " world " ] )
assert doc . lang_ == " en "
assert doc . lang == en_vocab . strings [ " en " ]
2020-07-22 14:42:59 +03:00
# the same values are expected at token level
assert doc [ 0 ] . lang_ == " en "
assert doc [ 0 ] . lang == en_vocab . strings [ " en " ]
# case 2: Doc produced by calling a freshly created English() object;
# the expected lang value is still looked up through en_vocab.strings
nlp = English ( )
doc = nlp ( " Hello world " )
assert doc . lang_ == " en "
assert doc . lang == en_vocab . strings [ " en " ]
assert doc [ 0 ] . lang_ == " en "
assert doc [ 0 ] . lang == en_vocab . strings [ " en " ]
2020-08-10 17:43:52 +03:00
# Takes the first token of a two-word Doc and inspects its .lex attribute.
def test_token_lexeme ( en_vocab ) :
""" Test that tokens expose their lexeme. """
token = Doc ( en_vocab , words = [ " Hello " , " world " ] ) [ 0 ]
# .lex must be a Lexeme instance with the same text as the token
assert isinstance ( token . lex , Lexeme )
assert token . lex . text == token . text
# indexing the vocab by the token's orth value must compare equal to .lex
assert en_vocab [ token . orth ] == token . lex
2020-09-17 01:14:01 +03:00
# Walk Doc.has_annotation through three stages for a two-token doc:
# nothing annotated, only token 0 annotated, and both tokens annotated,
# checking the default call and the require_complete=True call at each stage.
def test_has_annotation ( en_vocab ) :
doc = Doc ( en_vocab , words = [ " Hello " , " world " ] )
attrs = ( " TAG " , " POS " , " MORPH " , " LEMMA " , " DEP " , " HEAD " , " ENT_IOB " , " ENT_TYPE " )
# stage 1: fresh doc, neither form of the check may report annotation
for attr in attrs :
assert not doc . has_annotation ( attr )
2022-02-08 10:35:37 +03:00
assert not doc . has_annotation ( attr , require_complete = True )
2020-09-17 01:14:01 +03:00
# stage 2: annotate token 0 only
doc [ 0 ] . tag_ = " A "
doc [ 0 ] . pos_ = " X "
2020-10-01 23:21:46 +03:00
doc [ 0 ] . set_morph ( " Feat=Val " )
2020-09-17 01:14:01 +03:00
doc [ 0 ] . lemma_ = " a "
doc [ 0 ] . dep_ = " dep "
doc [ 0 ] . head = doc [ 1 ]
2020-09-21 16:54:05 +03:00
# entity on token 0; default="missing" is passed for the remaining tokens
doc . set_ents ( [ Span ( doc , 0 , 1 , label = " HELLO " ) ] , default = " missing " )
2020-09-17 01:14:01 +03:00
# partial annotation: present, but not complete
for attr in attrs :
assert doc . has_annotation ( attr )
assert not doc . has_annotation ( attr , require_complete = True )
# stage 3: annotate token 1 as well (no head is assigned to token 1 here)
doc [ 1 ] . tag_ = " A "
doc [ 1 ] . pos_ = " X "
2020-10-01 23:21:46 +03:00
# morph for token 1 is set from a literal without any Feat=Val pair
doc [ 1 ] . set_morph ( " " )
2020-09-17 01:14:01 +03:00
doc [ 1 ] . lemma_ = " a "
doc [ 1 ] . dep_ = " dep "
# one entity spanning both tokens
doc . ents = [ Span ( doc , 0 , 2 , label = " HELLO " ) ]
# now both the default and the require_complete=True checks must pass
for attr in attrs :
assert doc . has_annotation ( attr )
assert doc . has_annotation ( attr , require_complete = True )
2022-02-08 10:35:37 +03:00
# Same staged check as for the other attributes, but for the three
# sentence-boundary attribute names on a three-token doc.
def test_has_annotation_sents ( en_vocab ) :
doc = Doc ( en_vocab , words = [ " Hello " , " beautiful " , " world " ] )
attrs = ( " SENT_START " , " IS_SENT_START " , " IS_SENT_END " )
# fresh doc: no sentence annotation reported at all
for attr in attrs :
assert not doc . has_annotation ( attr )
assert not doc . has_annotation ( attr , require_complete = True )
# The first token (index 0) is always assumed to be a sentence start,
# and ignored by the check in doc.has_annotation
doc [ 1 ] . is_sent_start = False
# only token 1 set: annotation is present but incomplete
for attr in attrs :
assert doc . has_annotation ( attr )
assert not doc . has_annotation ( attr , require_complete = True )
doc [ 2 ] . is_sent_start = False
# tokens 1 and 2 set: annotation counts as complete
for attr in attrs :
assert doc . has_annotation ( attr )
assert doc . has_annotation ( attr , require_complete = True )
2020-09-17 01:14:01 +03:00
# Each of the four Doc.is_* flags below is only read, never used: the
# attribute access itself must trigger a deprecation warning, which
# pytest.deprecated_call() verifies.
def test_is_flags_deprecated ( en_tokenizer ) :
doc = en_tokenizer ( " test " )
with pytest . deprecated_call ( ) :
doc . is_tagged
with pytest . deprecated_call ( ) :
doc . is_parsed
with pytest . deprecated_call ( ) :
doc . is_nered
with pytest . deprecated_call ( ) :
doc . is_sentenced
2020-09-17 22:10:41 +03:00
2020-09-22 14:45:50 +03:00
# Exercise Doc.set_ents with its entity list and the blocked / missing /
# outside / default keyword arguments, checking the resulting per-token
# ent_iob and ent_type values. Judging by the expected lists below, ent_iob
# is 3 on the first token of a span (and on blocked tokens), 1 on a
# continuation token, 2 on tokens left outside, and 0 on missing tokens.
def test_doc_set_ents ( en_tokenizer ) :
2020-09-21 16:54:05 +03:00
# set ents
2020-09-17 22:10:41 +03:00
doc = en_tokenizer ( " a b c d e " )
2020-09-21 16:54:05 +03:00
# the fourth positional Span argument (10, 11, ...) is what shows up in
# ent_type below
doc . set_ents ( [ Span ( doc , 0 , 1 , 10 ) , Span ( doc , 1 , 3 , 11 ) ] )
assert [ t . ent_iob for t in doc ] == [ 3 , 3 , 1 , 2 , 2 ]
assert [ t . ent_type for t in doc ] == [ 10 , 11 , 11 , 0 , 0 ]
# add ents, invalid IOB repaired
doc = en_tokenizer ( " a b c d e " )
doc . set_ents ( [ Span ( doc , 0 , 1 , 10 ) , Span ( doc , 1 , 3 , 11 ) ] )
# the new span takes over tokens 0-1; token 2 keeps type 11 but its
# ent_iob changes from 1 to 3 because it now begins what is left of that span
doc . set_ents ( [ Span ( doc , 0 , 2 , 12 ) ] , default = " unmodified " )
assert [ t . ent_iob for t in doc ] == [ 3 , 1 , 3 , 2 , 2 ]
assert [ t . ent_type for t in doc ] == [ 12 , 12 , 11 , 0 , 0 ]
# missing ents
doc = en_tokenizer ( " a b c d e " )
doc . set_ents ( [ Span ( doc , 0 , 1 , 10 ) , Span ( doc , 1 , 3 , 11 ) ] , missing = [ doc [ 4 : 5 ] ] )
assert [ t . ent_iob for t in doc ] == [ 3 , 3 , 1 , 2 , 0 ]
assert [ t . ent_type for t in doc ] == [ 10 , 11 , 11 , 0 , 0 ]
# outside ents
doc = en_tokenizer ( " a b c d e " )
# with default="missing", the unlisted token 3 ends up as 0 while the
# explicitly listed outside token 4 ends up as 2
doc . set_ents (
[ Span ( doc , 0 , 1 , 10 ) , Span ( doc , 1 , 3 , 11 ) ] ,
outside = [ doc [ 4 : 5 ] ] ,
default = " missing " ,
)
assert [ t . ent_iob for t in doc ] == [ 3 , 3 , 1 , 0 , 2 ]
assert [ t . ent_type for t in doc ] == [ 10 , 11 , 11 , 0 , 0 ]
# blocked ents
doc = en_tokenizer ( " a b c d e " )
doc . set_ents ( [ ] , blocked = [ doc [ 1 : 2 ] , doc [ 3 : 5 ] ] , default = " unmodified " )
2020-09-17 22:10:41 +03:00
# blocked tokens get ent_iob 3 with ent_type 0 and are not reported in
# doc.ents
assert [ t . ent_iob for t in doc ] == [ 0 , 3 , 0 , 3 , 3 ]
assert [ t . ent_type for t in doc ] == [ 0 , 0 , 0 , 0 , 0 ]
assert doc . ents == tuple ( )
2020-09-21 16:54:05 +03:00
# invalid IOB repaired after blocked
2020-09-17 22:10:41 +03:00
doc . ents = [ Span ( doc , 3 , 5 , " ENT " ) ]
assert [ t . ent_iob for t in doc ] == [ 2 , 2 , 2 , 3 , 1 ]
2020-09-21 16:54:05 +03:00
# blocking only token 3 turns the former continuation token 4 (1) into 3
doc . set_ents ( [ ] , blocked = [ doc [ 3 : 4 ] ] , default = " unmodified " )
2020-09-17 22:10:41 +03:00
assert [ t . ent_iob for t in doc ] == [ 2 , 2 , 2 , 3 , 3 ]
2020-09-21 16:54:05 +03:00
# all types
doc = en_tokenizer ( " a b c d e " )
doc . set_ents (
[ Span ( doc , 0 , 1 , 10 ) ] ,
blocked = [ doc [ 1 : 2 ] ] ,
missing = [ doc [ 2 : 3 ] ] ,
outside = [ doc [ 3 : 4 ] ] ,
default = " unmodified " ,
)
assert [ t . ent_iob for t in doc ] == [ 3 , 3 , 0 , 2 , 0 ]
assert [ t . ent_type for t in doc ] == [ 10 , 0 , 0 , 0 , 0 ]
# the remaining calls are all expected to be rejected with ValueError
doc = en_tokenizer ( " a b c d e " )
# single span instead of a list
with pytest . raises ( ValueError ) :
doc . set_ents ( [ ] , missing = doc [ 1 : 2 ] )
# invalid default mode
with pytest . raises ( ValueError ) :
doc . set_ents ( [ ] , missing = [ doc [ 1 : 2 ] ] , default = " none " )
# conflicting/overlapping specifications
with pytest . raises ( ValueError ) :
doc . set_ents ( [ ] , missing = [ doc [ 1 : 2 ] ] , outside = [ doc [ 1 : 2 ] ] )
2020-09-22 14:45:50 +03:00
# Two ways of supplying entities are checked here: (label, start, end)
# tuples assigned to doc.ents, and a per-token tag list passed to the Doc
# constructor through ents=.
def test_doc_ents_setter ( ) :
2020-09-22 10:15:57 +03:00
""" Test that both strings and integers can be used to set entities in
tuple format via doc . ents . """
words = [ " a " , " b " , " c " , " d " , " e " ]
doc = Doc ( Vocab ( ) , words = words )
# first label is given as a string, second as the value returned by
# strings.add
doc . ents = [ ( " HELLO " , 0 , 2 ) , ( doc . vocab . strings . add ( " WORLD " ) , 3 , 5 ) ]
assert [ e . label_ for e in doc . ents ] == [ " HELLO " , " WORLD " ]
vocab = Vocab ( )
# NOTE(review): this tuple-format list is overwritten two statements below
# before it is ever read; its only lasting effect is the strings.add call
# on the new vocab. Confirm whether that is intentional before removing it.
ents = [ ( " HELLO " , 0 , 2 ) , ( vocab . strings . add ( " WORLD " ) , 3 , 5 ) ]
2020-10-01 17:22:18 +03:00
ents = [ " B-HELLO " , " I-HELLO " , " O " , " B-WORLD " , " I-WORLD " ]
2020-09-22 10:15:57 +03:00
# one tag per word, passed at construction time
doc = Doc ( vocab , words = words , ents = ents )
2020-09-24 13:36:51 +03:00
assert [ e . label_ for e in doc . ents ] == [ " HELLO " , " WORLD " ]
2020-10-01 17:22:18 +03:00
2020-10-01 23:21:46 +03:00
# Check assignment to Token.morph: copying within one doc, copying between
# two docs made by the same tokenizer fixture, and the expected ValueError
# when the value comes from a doc made by a different tokenizer fixture.
def test_doc_morph_setter ( en_tokenizer , de_tokenizer ) :
doc1 = en_tokenizer ( " a b " )
doc1b = en_tokenizer ( " c d " )
doc2 = de_tokenizer ( " a b " )
# unset values can be copied
doc1 [ 0 ] . morph = doc1 [ 1 ] . morph
# both morph keys are still 0 after copying an unset value
assert doc1 [ 0 ] . morph . key == 0
assert doc1 [ 1 ] . morph . key == 0
# morph values from the same vocab can be copied
doc1 [ 0 ] . set_morph ( " Feat=Val " )
doc1 [ 1 ] . morph = doc1 [ 0 ] . morph
assert doc1 [ 0 ] . morph == doc1 [ 1 ] . morph
# ... also across docs
doc1b [ 0 ] . morph = doc1 [ 0 ] . morph
assert doc1 [ 0 ] . morph == doc1b [ 0 ] . morph
doc2 [ 0 ] . set_morph ( " Feat2=Val2 " )
# the morph value must come from the same vocab
# (presumably en_tokenizer and de_tokenizer use separate vocabs; the
# fixtures are defined outside this file)
with pytest . raises ( ValueError ) :
doc1 [ 0 ] . morph = doc2 [ 0 ] . morph
2020-10-01 17:22:18 +03:00
# Each case builds a five-word Doc with a different per-token ents list and
# checks either the number of resulting entities or that construction fails
# with ValueError.
def test_doc_init_iob ( ) :
""" Test ents validation/normalization in Doc.__init__ """
words = [ " a " , " b " , " c " , " d " , " e " ]
# all-outside tags: no entities at all
ents = [ " O " ] * len ( words )
doc = Doc ( Vocab ( ) , words = words , ents = ents )
assert doc . ents == ( )
# an I- tag right after an O tag is accepted and counted as a new entity
ents = [ " B-PERSON " , " I-PERSON " , " O " , " I-PERSON " , " I-PERSON " ]
doc = Doc ( Vocab ( ) , words = words , ents = ents )
assert len ( doc . ents ) == 2
# an I- tag whose type differs from the previous token also counts as a
# new entity
ents = [ " B-PERSON " , " I-PERSON " , " O " , " I-PERSON " , " I-GPE " ]
doc = Doc ( Vocab ( ) , words = words , ents = ents )
assert len ( doc . ents ) == 3
# None is missing
ents = [ " B-PERSON " , " I-PERSON " , " O " , None , " I-GPE " ]
doc = Doc ( Vocab ( ) , words = words , ents = ents )
assert len ( doc . ents ) == 2
# empty tag is missing
ents = [ " " , " B-PERSON " , " O " , " B-PERSON " , " I-PERSON " ]
doc = Doc ( Vocab ( ) , words = words , ents = ents )
assert len ( doc . ents ) == 2
# invalid IOB
ents = [ " Q-PERSON " , " I-PERSON " , " O " , " I-PERSON " , " I-GPE " ]
with pytest . raises ( ValueError ) :
doc = Doc ( Vocab ( ) , words = words , ents = ents )
# no dash
ents = [ " OPERSON " , " I-PERSON " , " O " , " I-PERSON " , " I-GPE " ]
with pytest . raises ( ValueError ) :
doc = Doc ( Vocab ( ) , words = words , ents = ents )
# no ent type
ents = [ " O " , " B- " , " O " , " I-PERSON " , " I-GPE " ]
with pytest . raises ( ValueError ) :
doc = Doc ( Vocab ( ) , words = words , ents = ents )
# not strings or None
ents = [ 0 , " B- " , " O " , " I-PERSON " , " I-GPE " ]
with pytest . raises ( ValueError ) :
doc = Doc ( Vocab ( ) , words = words , ents = ents )
2020-10-09 13:10:25 +03:00
2020-10-09 15:42:51 +03:00
# Spans are created first, then the doc is retokenized by merging each of
# them; assigning the original Span objects to doc.ents afterwards is
# expected to raise IndexError.
def test_doc_set_ents_invalid_spans ( en_tokenizer ) :
2020-10-09 13:10:25 +03:00
doc = en_tokenizer ( " Some text about Colombia and the Czech Republic " )
spans = [ Span ( doc , 3 , 4 , label = " GPE " ) , Span ( doc , 6 , 8 , label = " GPE " ) ]
with doc . retokenize ( ) as retokenizer :
for span in spans :
retokenizer . merge ( span )
2020-10-09 15:42:51 +03:00
# NOTE(review): presumably the span boundaries no longer match the doc
# once the merge has been applied - confirm against the Doc.ents setter
with pytest . raises ( IndexError ) :
doc . ents = spans
2021-01-14 09:30:41 +03:00
2021-01-17 14:56:05 +03:00
# Runs a MultiLanguage() pipeline over one sentence and expects that
# consuming doc.noun_chunks raises NotImplementedError.
def test_doc_noun_chunks_not_implemented ( ) :
""" Test that a language without noun_chunk iterator, throws a NotImplementedError """
text = " Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat. "
nlp = MultiLanguage ( )
doc = nlp ( text )
with pytest . raises ( NotImplementedError ) :
2021-01-30 04:52:33 +03:00
# list() consumes noun_chunks inside the raises block; the result is
# discarded, hence the throwaway name and the F841 suppression
_ = list ( doc . noun_chunks ) # noqa: F841
2021-01-17 14:56:05 +03:00
2021-01-14 09:30:41 +03:00
# Exercise doc.spans as a keyed container of span groups: assignment,
# membership, append/extend, the has_overlap flag and deletion.
def test_span_groups ( en_tokenizer ) :
doc = en_tokenizer ( " Some text about Colombia and the Czech Republic " )
doc . spans [ " hi " ] = [ Span ( doc , 3 , 4 , label = " bye " ) ]
# membership is by group key, not by the label of a contained span
assert " hi " in doc . spans
assert " bye " not in doc . spans
assert len ( doc . spans [ " hi " ] ) == 1
assert doc . spans [ " hi " ] [ 0 ] . label_ == " bye "
# appended spans keep insertion order; tokens 0-2 do not overlap token 3
doc . spans [ " hi " ] . append ( doc [ 0 : 3 ] )
assert len ( doc . spans [ " hi " ] ) == 2
assert doc . spans [ " hi " ] [ 1 ] . text == " Some text about "
assert [ span . text for span in doc . spans [ " hi " ] ] == [ " Colombia " , " Some text about " ]
assert not doc . spans [ " hi " ] . has_overlap
# extending with doc.ents adds a second span over token 3, so the group
# now contains overlapping spans
doc . ents = [ Span ( doc , 3 , 4 , label = " GPE " ) , Span ( doc , 6 , 8 , label = " GPE " ) ]
doc . spans [ " hi " ] . extend ( doc . ents )
assert len ( doc . spans [ " hi " ] ) == 4
assert [ span . label_ for span in doc . spans [ " hi " ] ] == [ " bye " , " " , " GPE " , " GPE " ]
assert doc . spans [ " hi " ] . has_overlap
# deleting the key removes the whole group
del doc . spans [ " hi " ]
assert " hi " not in doc . spans
2021-02-28 04:32:48 +03:00
# The doc_ref stored on doc.spans must compare equal to a weak reference to
# the owning doc, for the original doc and for the doc returned by copy().
def test_doc_spans_copy ( en_tokenizer ) :
doc1 = en_tokenizer ( " Some text about Colombia and the Czech Republic " )
assert weakref . ref ( doc1 ) == doc1 . spans . doc_ref
doc2 = doc1 . copy ( )
# the copy's spans must point at the copy itself
assert weakref . ref ( doc2 ) == doc2 . spans . doc_ref
2022-05-12 11:06:25 +03:00
# Check doc.spans.setdefault with no default, with a plain list of spans,
# and with a SpanGroup, by looking at the length of the stored group.
def test_doc_spans_setdefault ( en_tokenizer ) :
doc = en_tokenizer ( " Some text about Colombia and the Czech Republic " )
# no default given: the key exists afterwards with an empty group
doc . spans . setdefault ( " key1 " )
assert len ( doc . spans [ " key1 " ] ) == 0
# default given as a list holding one span
doc . spans . setdefault ( " key2 " , default = [ doc [ 0 : 1 ] ] )
assert len ( doc . spans [ " key2 " ] ) == 1
# default given as a SpanGroup holding two spans
doc . spans . setdefault ( " key3 " , default = SpanGroup ( doc , spans = [ doc [ 0 : 1 ] , doc [ 1 : 2 ] ] ) )
assert len ( doc . spans [ " key3 " ] ) == 2
2022-11-23 15:09:32 +03:00
def test_doc_sentiment_from_bytes_v3_to_v4 ( ) :
""" Test if a doc with sentiment attribute created in v3.x works with ' .from_bytes ' in v4.x without throwing errors. The sentiment attribute was removed in v4 """
doc_bytes = b " \x89 \xa4 text \xa5 happy \xaa array_head \x9f GQACKOLMN \xcd \x01 \xc4 \xcd \x01 \xc6 I \xcd \x01 \xc5 JP \xaa array_body \x85 \xc4 \x02 nd \xc3 \xc4 \x04 type \xa3 <u8 \xc4 \x04 kind \xc4 \x00 \xc4 \x05 shape \x92 \x01 \x0f \xc4 \x04 data \xc4 x \x05 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \xa4 \x9a \xd3 \x17 \xca \xf0 b \x03 \xa4 \x9a \xd3 \x17 \xca \xf0 b \x03 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \x01 \x00 \x00 \x00 \x00 \x00 \x00 \x00 \xa9 sentiment \xcb ? \xf0 \x00 \x00 \x00 \x00 \x00 \x00 \xa6 tensor \x85 \xc4 \x02 nd \xc3 \xc4 \x04 type \xa3 <f4 \xc4 \x04 kind \xc4 \x00 \xc4 \x05 shape \x91 \x00 \xc4 \x04 data \xc4 \x00 \xa4 cats \x80 \xa5 spans \xc4 \x01 \x90 \xa7 strings \x92 \xa0 \xa5 happy \xb2 has_unknown_spaces \xc2 "
doc = Doc ( Vocab ( ) ) . from_bytes ( doc_bytes )
assert doc . text == " happy "
with pytest . raises ( AttributeError ) :
doc . sentiment == 1.0