2023-06-14 18:48:41 +03:00
import warnings
2021-02-28 04:32:48 +03:00
import weakref
2017-01-11 20:05:36 +03:00
import numpy
2021-12-04 22:34:48 +03:00
import pytest
2023-06-14 18:48:41 +03:00
from numpy . testing import assert_array_equal
2021-12-04 22:34:48 +03:00
from thinc . api import NumpyOps , get_current_ops
2021-01-17 14:56:05 +03:00
2023-06-14 18:48:41 +03:00
from spacy . attrs import (
DEP ,
ENT_IOB ,
ENT_TYPE ,
HEAD ,
IS_ALPHA ,
MORPH ,
POS ,
SENT_START ,
TAG ,
)
2021-12-04 22:34:48 +03:00
from spacy . lang . en import English
2021-01-17 14:56:05 +03:00
from spacy . lang . xx import MultiLanguage
2021-12-04 22:34:48 +03:00
from spacy . language import Language
from spacy . lexeme import Lexeme
2022-05-12 11:06:25 +03:00
from spacy . tokens import Doc , Span , SpanGroup , Token
2018-07-25 00:38:44 +03:00
from spacy . vocab import Vocab
2021-03-30 10:49:12 +03:00
from . test_underscore import clean_underscore # noqa: F401
2015-02-07 21:14:07 +03:00
2020-09-21 18:59:09 +03:00
def test_doc_api_init(en_vocab):
    """Check `is_sent_start` for docs built from `sent_starts`, `heads`, or both."""
    tokens = [" a ", " b ", " c ", " d "]
    head_indices = [0, 0, 2, 2]
    expected_flags = [True, False, True, False]

    def sent_start_flags(doc):
        return [token.is_sent_start for token in doc]

    # Sentence starts passed in directly.
    explicit = Doc(en_vocab, words=tokens, sent_starts=[True, False, True, False])
    assert sent_start_flags(explicit) == expected_flags

    # Sentence starts set by the heads.
    from_heads = Doc(en_vocab, words=tokens, heads=head_indices, deps=[" dep "] * 4)
    assert sent_start_flags(from_heads) == expected_flags

    # Both given: the heads override the all-True sent_starts.
    both = Doc(
        en_vocab,
        words=tokens,
        sent_starts=[True] * 4,
        heads=head_indices,
        deps=[" dep "] * 4,
    )
    assert sent_start_flags(both) == expected_flags
2017-01-11 20:05:36 +03:00
2015-02-07 21:14:07 +03:00
2021-12-04 22:34:48 +03:00
@pytest.mark.issue(1547)
def test_issue1547():
    """Entities must still be present after tokens overlapping them are merged."""
    # fmt: off
    doc = Doc(
        Vocab(),
        words=[" \n ", " worda ", " . ", " \n ", " wordb ", " - ", " Biosphere ", " 2 ", " - ", " \n "],
    )
    # fmt: on
    product_label = doc.vocab.strings[" PRODUCT "]
    doc.ents = [Span(doc, 6, 8, label=product_label)]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    entity_texts = [ent.text for ent in doc.ents]
    assert entity_texts
@pytest.mark.issue(1757)
def test_issue1757():
    """Comparing a token, span or lexeme against None must not segfault."""
    doc = Doc(Vocab(), words=[" a ", " b ", " c "])

    # Token
    assert not doc[0] < None
    assert doc[0] is not None
    assert doc[0] >= None

    # Span
    assert not doc[:2] < None
    assert doc[:2] is not None
    assert doc[:2] >= None

    # Lexeme
    assert doc.vocab[" a "] is not None
    assert not doc.vocab[" a "] < None
@pytest.mark.issue(2396)
def test_issue2396(en_vocab):
    """The LCA matrix of a doc and of a span covering the whole doc must agree."""
    tokens = [" She ", " created ", " a ", " test ", " for ", " spacy "]
    head_indices = [1, 1, 3, 1, 3, 4]
    expected_rows = [
        [0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 2, 3, 3, 3],
        [1, 1, 3, 3, 3, 3],
        [1, 1, 3, 3, 4, 4],
        [1, 1, 3, 3, 4, 5],
    ]
    expected = numpy.array(expected_rows, dtype=numpy.int32)
    doc = Doc(
        en_vocab,
        words=tokens,
        heads=head_indices,
        deps=[" dep "] * len(head_indices),
    )
    whole_span = doc[:]
    for container in (doc, whole_span):
        assert (container.get_lca_matrix() == expected).all()
2022-09-26 16:58:21 +03:00
@pytest.mark.issue(11499)
def test_init_args_unmodified(en_vocab):
    """Constructing a Doc must leave the `ents` and `sent_starts` lists untouched."""
    ents = [" B-TYPE1 ", " "]
    sent_starts = [True, False]
    # Snapshots taken before the constructor gets a chance to mutate anything.
    ents_before = list(ents)
    sent_starts_before = list(sent_starts)

    Doc(
        vocab=en_vocab,
        words=[" A ", " sentence "],
        ents=ents,
        sent_starts=sent_starts,
    )

    assert ents == ents_before
    assert sent_starts == sent_starts_before
2021-12-04 22:34:48 +03:00
@pytest.mark.parametrize(" text ", [" -0.23 ", " +123,456 ", " ±1 "])
@pytest.mark.parametrize(" lang_cls ", [English, MultiLanguage])
@pytest.mark.issue(2782)
def test_issue2782(text, lang_cls):
    """A signed number must come out as a single `like_num` token."""
    pipeline = lang_cls()
    tokens = list(pipeline(text))
    assert len(tokens) == 1
    assert tokens[0].like_num
@pytest.mark.parametrize(
    " sentence ",
    [
        " The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction. ",
        " The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale ' s #1. ",
        " The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale ' s number one ",
        " Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions. ",
        " It was a missed assignment, but it shouldn ' t have resulted in a turnover ... ",
    ],
)
@pytest.mark.issue(3869)
def test_issue3869(sentence):
    """`Doc.count_by(IS_ALPHA)` must agree with a manual tally of `is_alpha`."""
    doc = English()(sentence)
    manual_total = sum(token.is_alpha for token in doc)
    counted = doc.count_by(IS_ALPHA)
    assert manual_total == counted.get(1, 0)
@pytest.mark.issue(3962)
def test_issue3962(en_vocab):
    """Ensure `Span.as_doc` never leaves a head pointing outside the new doc.

    Tokens whose head would fall outside the span are expected to be re-attached
    inside it (to themselves or to another in-span token) with the "dep" label.
    """
    # fmt: off
    words = [" He ", " jests ", " at ", " scars ", " , ", " that ", " never ", " felt ", " a ", " wound ", " . "]
    heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
    deps = [" nsubj ", " ccomp ", " prep ", " pobj ", " punct ", " nsubj ", " neg ", " ROOT ", " det ", " dobj ", " punct "]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)

    def check_heads_and_deps(sub_doc, expected):
        # `expected` holds one (head text, dependency label) pair per token index.
        for index, (head_text, dep_label) in enumerate(expected):
            assert sub_doc[index].head.text == head_text
            assert sub_doc[index].dep_ == dep_label

    # "jests at scars ,"
    jests_doc = doc[1:5].as_doc()
    assert jests_doc.to_json()
    check_heads_and_deps(
        jests_doc,
        [
            (" jests ", " dep "),  # its own head: the new artificial root
            (" jests ", " prep "),
            (" at ", " pobj "),
            (" jests ", " dep "),  # re-attached to the new artificial root
        ],
    )
    # Still exactly one sentence.
    assert len(list(jests_doc.sents)) == 1

    # "never felt a"
    felt_doc = doc[6:9].as_doc()
    assert felt_doc.to_json()
    check_heads_and_deps(
        felt_doc,
        [
            (" felt ", " neg "),
            (" felt ", " ROOT "),
            (" felt ", " dep "),  # re-attached to an ancestor inside the span
        ],
    )
    # Still one sentence, as "a" can hang off "felt" instead of "wound".
    assert len(list(felt_doc.sents)) == 1
@pytest.mark.issue(3962)
def test_issue3962_long(en_vocab):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    This is achieved by setting the head to itself if it would lie out of the
    span otherwise. The span used here crosses a sentence boundary, so both
    sentences are checked, and the resulting doc must still have two sentences.
    """
    # fmt: off
    words = [" He ", " jests ", " at ", " scars ", " . ", " They ", " never ", " felt ", " a ", " wound ", " . "]
    heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
    deps = [" nsubj ", " ROOT ", " prep ", " pobj ", " punct ", " nsubj ", " neg ", " ROOT ", " det ", " dobj ", " punct "]
    # fmt: on
    two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].head.text == " jests "
    assert doc2[0].dep_ == " ROOT "
    assert doc2[1].head.text == " jests "
    assert doc2[1].dep_ == " prep "
    assert doc2[2].head.text == " at "
    assert doc2[2].dep_ == " pobj "
    assert doc2[3].head.text == " jests "
    assert doc2[3].dep_ == " punct "
    # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].head.text == " They "
    assert doc2[4].dep_ == " dep "
    # head set to the new artificial head (in sentence 2)
    # This checks token 5 ("never"); the previous version repeated the token-4
    # assertions verbatim, so the re-attached token was never verified.
    assert doc2[5].head.text == " They "
    assert doc2[5].dep_ == " dep "
    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == " jests at scars . "
    assert sents[1].text == " They never "
@Language.factory(" my_pipe ")
class CustomPipe:
    """Pipeline component that registers and fills a custom extension.

    Registered as a `Language` factory so it can be added by name with
    `nlp.add_pipe`.
    """

    def __init__(self, nlp, name=" my_pipe "):
        """Store the component name and register the extensions.

        nlp: The pipeline object handed in by the factory (not used here).
        name: Name under which this component instance is stored.
        """
        self.name = name
        # Span extension is getter-based; Doc extension is a plain value
        # defaulting to None.
        Span.set_extension(" my_ext ", getter=self._get_my_ext)
        Doc.set_extension(" my_ext ", default=None)

    def __call__(self, doc):
        """Compute the extension value per sentence and store the joined result on the doc.

        doc: The document being processed; iterated via `doc.sents`.
        RETURNS: The same `doc` object.
        """
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set(" my_ext ", sent_ext)
            gathered_ext.append(sent_ext)
        # One value per sentence, joined into a single string on the doc.
        doc._.set(" my_ext ", " \n ".join(gathered_ext))
        return doc

    @staticmethod
    def _get_my_ext(span):
        """Return the span's `end` attribute as a string."""
        return str(span.end)
@pytest.mark.issue(4903)
def test_issue4903():
    """Multi-process `nlp.pipe` with a custom component must run to completion.

    Guards against hangs or crashes on Windows / macOS.
    """
    nlp = English()
    nlp.add_pipe(" sentencizer ")
    nlp.add_pipe(" my_pipe ", after=" sentencizer ")
    expected_texts = (
        " I like bananas. ",
        " Do you like them? ",
        " No, I prefer wasabi. ",
    )
    # The multi-process run is only attempted when the current ops are NumpyOps.
    if not isinstance(get_current_ops(), NumpyOps):
        return
    docs = list(nlp.pipe(list(expected_texts), n_process=2))
    for index, expected in enumerate(expected_texts):
        assert docs[index].text == expected
@pytest.mark.issue(5048)
def test_issue5048(en_vocab):
    """Annotations loaded via `from_array` must match those passed to `Doc()`."""
    words = [" This ", " is ", " a ", " sentence "]
    pos_names = [" DET ", " VERB ", " DET ", " NOUN "]
    spaces = [" ", " ", " ", " "]
    dep_names = [" dep ", " adj ", " nn ", " atm "]
    tag_names = [" DT ", " VBZ ", " DT ", " NN "]

    def annotations(doc):
        return [(token.text, token.pos_, token.tag_) for token in doc]

    # Intern every string and keep the IDs needed for the attribute array.
    string_store = en_vocab.strings
    for word in words:
        string_store.add(word)
    dep_ids = [string_store.add(name) for name in dep_names]
    pos_ids = [string_store.add(name) for name in pos_names]
    tag_ids = [string_store.add(name) for name in tag_names]

    rows = list(zip(pos_ids, dep_ids, tag_ids))
    values = numpy.array(rows, dtype=" uint64 ")
    from_array_doc = Doc(en_vocab, words=words, spaces=spaces)
    from_array_doc.from_array([POS, DEP, TAG], values)
    via_array = annotations(from_array_doc)

    from_init_doc = Doc(
        en_vocab, words=words, pos=pos_names, deps=dep_names, tags=tag_names
    )
    via_init = annotations(from_init_doc)

    assert via_array == via_init
2018-11-27 03:09:36 +03:00
@pytest.mark.parametrize(" text ", [[" one ", " two ", " three "]])
def test_doc_api_compare_by_string_position(en_vocab, text):
    """Tokens must compare by their position in the doc.

    en_vocab: Shared English vocab fixture.
    text: The three words the doc is built from.
    """
    doc = Doc(en_vocab, words=text)
    # Get the tokens in this order, so their ID ordering doesn't match the idx.
    # These are the objects that get compared below: the previous version
    # fetched `doc[-1]` for token1 as well and then overwrote all three by
    # unpacking `doc`, which made the reverse-order fetch a no-op.
    token3 = doc[-1]
    token2 = doc[-2]
    token1 = doc[-3]
    assert token1 < token2 < token3
    assert not token1 > token2
    assert token2 > token1
    assert token2 <= token3
    assert token3 >= token1
def test_doc_api_getitem(en_tokenizer):
    """Exercise integer indexing and slicing on a Doc and on a Span taken from it."""
    tokens = en_tokenizer(" Give it back! He pleaded. ")

    def slashed(sequence):
        return " / ".join(token.text for token in sequence)

    # Integer indexing, including one past the end.
    assert tokens[0].text == " Give "
    assert tokens[-1].text == " . "
    with pytest.raises(IndexError):
        tokens[len(tokens)]

    # An empty slice gives an empty span.
    assert not slashed(tokens[1:1])

    # Any step other than 1 is rejected.
    for stepped in (slice(1, 4, 2), slice(1, 4, -1)):
        with pytest.raises(ValueError):
            tokens[stepped]

    # Doc slices paired with the expected "/"-joined token texts.
    doc_cases = [
        (slice(1, 4), " it/back/! "),
        (slice(1, 4, 1), " it/back/! "),
        (slice(-3, 6), " He/pleaded "),
        (slice(4, -1), " He/pleaded "),
        (slice(-5, -3), " back/! "),
        (slice(None), " Give/it/back/!/He/pleaded/. "),
        (slice(4, None), " He/pleaded/. "),
        (slice(None, 4), " Give/it/back/! "),
        (slice(None, -3), " Give/it/back/! "),
        (slice(-3, None), " He/pleaded/. "),
        (slice(4, 50), " He/pleaded/. "),
        (slice(-50, 4), " Give/it/back/! "),
    ]
    for key, expected in doc_cases:
        assert slashed(tokens[key]) == expected

    # Doc slices expected to be empty, paired with the expected start (== end).
    empty_doc_cases = [
        (slice(5, 4), 5),
        (slice(4, -3), 4),
        (slice(-50, -40), 0),
        (slice(40, 50), 7),
    ]
    for key, boundary in empty_doc_cases:
        empty = tokens[key]
        assert empty.start == empty.end == boundary and not slashed(empty)

    # The same checks on a span, sliced relative to the span itself.
    span = tokens[1:4]
    assert span[0].orth_ == " it "
    span_cases = [
        (slice(None), " it/back/! "),
        (slice(None, 2), " it/back "),
        (slice(1, None), " back/! "),
        (slice(None, -1), " it/back "),
        (slice(-2, None), " back/! "),
        (slice(1, 2), " back "),
        (slice(-2, -1), " back "),
        (slice(-50, 50), " it/back/! "),
    ]
    for key, expected in span_cases:
        assert slashed(span[key]) == expected

    subspan = span[50:-50]
    assert subspan.start == subspan.end == 4 and not slashed(subspan)
2015-10-06 12:08:39 +03:00
2015-07-13 19:39:38 +03:00
2018-11-27 03:09:36 +03:00
@pytest.mark.parametrize(
    " text ", [" Give it back! He pleaded. ", " Give it back! He pleaded. "]
)
def test_doc_api_serialize(en_tokenizer, text):
    """Round-trip a Doc through `to_bytes` / `from_bytes`.

    Also checks that serializing warns once a user hook is set.
    """
    tokens = en_tokenizer(text)
    tokens[0].lemma_ = " lemma "
    tokens[0].norm_ = " norm "
    tokens.ents = [(tokens.vocab.strings[" PRODUCT "], 0, 1)]
    tokens[0].ent_kb_id_ = " ent_kb_id "
    tokens[0].ent_id_ = " ent_id "

    def assert_same_tokens(restored):
        assert tokens.text == restored.text
        assert [t.text for t in tokens] == [t.text for t in restored]
        assert [t.orth for t in tokens] == [t.orth for t in restored]

    # Full round trip: text, tokens and the attributes set above survive.
    restored = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert_same_tokens(restored)
    assert restored[0].lemma_ == " lemma "
    assert restored[0].norm_ == " norm "
    assert restored[0].ent_kb_id_ == " ent_kb_id "
    assert restored[0].ent_id_ == " ent_id "

    # Round trips with one field excluded on both sides.
    for field in (" tensor ", " sentiment "):
        payload = tokens.to_bytes(exclude=[field])
        restored = Doc(tokens.vocab).from_bytes(payload, exclude=[field])
        assert_same_tokens(restored)

    def similarity_hook(d1, d2):
        return " hello! "

    # Without hooks this call sits outside the `warns` block.
    tokens.to_bytes()
    with pytest.warns(UserWarning):
        tokens.user_hooks[" similarity "] = similarity_hook
        tokens.to_bytes()
2020-12-29 13:54:32 +03:00
2015-08-06 01:35:40 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_set_ents(en_tokenizer):
    """Assigning `(label, start, end)` tuples to `doc.ents` sets one entity."""
    doc = en_tokenizer(" I use goggle chrone to surf the web ")
    assert len(doc.ents) == 0

    product = doc.vocab.strings[" PRODUCT "]
    doc.ents = [(product, 2, 4)]

    assert len(list(doc.ents)) == 1
    iob_codes = [token.ent_iob for token in doc]
    assert iob_codes == [2, 2, 3, 1, 2, 2, 2, 2]

    entity = doc.ents[0]
    assert entity.label_ == " PRODUCT "
    assert entity.start == 2
    assert entity.end == 4
2015-10-18 09:17:27 +03:00
2017-01-11 20:05:36 +03:00
def test_doc_api_sents_empty_string(en_tokenizer):
    """Iterating `doc.sents` for this input must yield no sentences."""
    doc = en_tokenizer(" ")
    sentence_count = len([sent for sent in doc.sents])
    assert sentence_count == 0
2017-01-11 20:05:36 +03:00
def test_doc_api_runtime_error(en_tokenizer):
    """Merging trimmed noun chunks must not raise.

    The example caused a run-time error while parsing Reddit.
    """
    # fmt: off
    text = " 67 % o f black households are single parent \n \n 72 % o f all black babies born out of wedlock \n \n 50 % o f all black kids don \u2019 t finish high school "
    deps = [" nummod ", " nsubj ", " prep ", " amod ", " pobj ", " ROOT ", " amod ", " attr ", " ", " nummod ", " appos ", " prep ", " det ",
            " amod ", " pobj ", " acl ", " prep ", " prep ", " pobj ",
            " ", " nummod ", " nsubj ", " prep ", " det ", " amod ", " pobj ", " aux ", " neg ", " ccomp ", " amod ", " dobj "]
    # fmt: on
    tokenized = en_tokenizer(text)
    doc = Doc(tokenized.vocab, words=[t.text for t in tokenized], deps=deps)

    kept_first_deps = (" advmod ", " amod ", " compound ")

    def drop_leading_tokens(chunk):
        # Shorten from the left until the first token has one of the kept
        # dependency labels or only one token remains.
        while len(chunk) > 1 and chunk[0].dep_ not in kept_first_deps:
            chunk = chunk[1:]
        return chunk

    trimmed_chunks = (drop_leading_tokens(chunk) for chunk in doc.noun_chunks)
    mergeable = [chunk for chunk in trimmed_chunks if len(chunk) > 1]

    with doc.retokenize() as retokenizer:
        for chunk in mergeable:
            retokenizer.merge(
                chunk,
                attrs={
                    " tag ": chunk.root.tag_,
                    " lemma ": chunk.text,
                    " ent_type ": chunk.root.ent_type_,
                },
            )
2016-02-07 01:47:51 +03:00
2020-09-21 21:43:54 +03:00
def test_doc_api_right_edge(en_vocab):
    """Regression test for an incorrect right edge caused by the Unshift action."""
    # fmt: off
    words = [
        " I ", " have ", " proposed ", " to ", " myself ", " , ", " for ", " the ", " sake ",
        " of ", " such ", " as ", " live ", " under ", " the ", " government ", " of ", " the ",
        " Romans ", " , ", " to ", " translate ", " those ", " books ", " into ", " the ",
        " Greek ", " tongue ", " . "
    ]
    heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2]
    expected_subtree = [" for ", " the ", " sake ", " of ", " such ", " as ", " live ", " under ", " the ", " government ", " of ", " the ", " Romans ", " , "]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=[" dep "] * len(heads))

    anchor = doc[6]
    assert anchor.text == " for "
    subtree_texts = [token.text for token in anchor.subtree]
    assert subtree_texts == expected_subtree
    assert anchor.right_edge.text == " , "
2017-01-11 20:05:36 +03:00
2017-10-24 18:05:15 +03:00
def test_doc_api_has_vector():
    """A doc whose only word has a vector in the vocab reports `has_vector`."""
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    kitten_vector = numpy.asarray([0.0, 2.0], dtype=" f ")
    vocab.set_vector(" kitten ", vector=kitten_vector)

    kitten_doc = Doc(vocab, words=[" kitten "])
    assert kitten_doc.has_vector
2016-10-16 21:20:23 +03:00
2018-01-15 18:29:48 +03:00
def test_doc_api_similarity_match():
    """Check `Doc.similarity` against a token, a lexeme, a span and another doc."""
    single = Doc(Vocab(), words=[" a "])
    # Same single word, as a token and as a lexeme.
    for same_word in (single[0], single.vocab[" a "]):
        assert single.similarity(same_word) == 1.0

    longer = Doc(single.vocab, words=[" a ", " b ", " c "])
    with pytest.warns(UserWarning):
        assert single.similarity(longer[:1]) == 1.0
        assert single.similarity(longer) == 0.0
2018-01-15 18:29:48 +03:00
2019-02-07 22:54:07 +03:00
@pytest.mark.parametrize(
    " words,heads,lca_matrix ",
    [
        (
            [" the ", " lazy ", " dog ", " slept "],
            [2, 2, 3, 3],
            numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]),
        ),
        (
            [" The ", " lazy ", " dog ", " slept ", " . ", " The ", " quick ", " fox ", " jumped "],
            [2, 2, 3, 3, 3, 7, 7, 8, 8],
            numpy.array(
                [
                    [0, 2, 2, 3, 3, -1, -1, -1, -1],
                    [2, 1, 2, 3, 3, -1, -1, -1, -1],
                    [2, 2, 2, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 4, -1, -1, -1, -1],
                    [-1, -1, -1, -1, -1, 5, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 6, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 8, 8, 8, 8],
                ]
            ),
        ),
    ],
)
def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix):
    """`Doc.get_lca_matrix` must match the expected matrix for each parse."""
    dep_labels = [" dep "] * len(heads)
    doc = Doc(en_vocab, words, heads=heads, deps=dep_labels)
    lca = doc.get_lca_matrix()
    assert (lca == lca_matrix).all()
    # Spot checks on individual cells, shared by both parametrized parses.
    spot_checks = (((1, 1), 1), ((0, 1), 2), ((1, 2), 2))
    for (row, col), ancestor in spot_checks:
        assert lca[row, col] == ancestor
2019-03-10 17:24:34 +03:00
def test_doc_is_nered(en_vocab):
    """`has_annotation("ENT_IOB")` must track whether entity annotation was set."""
    words = [" I ", " live ", " in ", " New ", " York "]

    plain = Doc(en_vocab, words=words)
    assert not plain.has_annotation(" ENT_IOB ")
    plain.ents = [Span(plain, 3, 5, label=" GPE ")]
    assert plain.has_annotation(" ENT_IOB ")

    # Doc created from an array that contains unknown values.
    ent_rows = [[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]]
    ent_array = numpy.array(ent_rows, dtype=" uint64 ")
    from_array = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], ent_array)
    assert from_array.has_annotation(" ENT_IOB ")

    # The annotation survives serialization.
    restored = Doc(en_vocab).from_bytes(from_array.to_bytes())
    assert restored.has_annotation(" ENT_IOB ")
2019-03-11 16:21:40 +03:00
2020-02-16 19:17:09 +03:00
def test_doc_from_array_sent_starts(en_vocab):
    """Check how `from_array` treats SENT_START and HEAD, alone and combined."""
    # fmt: off
    words = [" I ", " live ", " in ", " New ", " York ", " . ", " I ", " like ", " cats ", " . "]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = [" ROOT ", " dep ", " dep ", " dep ", " dep ", " dep ", " ROOT ", " dep ", " dep ", " dep "]
    # fmt: on
    source = Doc(en_vocab, words=words, heads=heads, deps=deps)

    def sent_start_flags(doc):
        return [token.is_sent_start for token in doc]

    def load_into(target, attr_ids):
        # Export the given attributes from `source` and load them into `target`.
        values = source.to_array(attr_ids)
        target.from_array(attr_ids, values)
        return target

    # HEAD overrides SENT_START without warning.
    reloaded = load_into(Doc(en_vocab, words=words), [SENT_START, HEAD])

    # No warning when the default attrs are loaded into that same doc.
    default_attrs = source._get_array_attrs()
    default_values = source.to_array(default_attrs)
    with warnings.catch_warnings():
        warnings.simplefilter(" error ")
        reloaded.from_array(default_attrs, default_values)

    # SENT_START alone is applied as-is and adds no DEP annotation.
    reloaded = load_into(Doc(en_vocab, words=words), [SENT_START])
    assert sent_start_flags(source) == sent_start_flags(reloaded)
    assert not reloaded.has_annotation(" DEP ")

    # HEAD alone (with DEP) yields the same sentence starts.
    reloaded = load_into(Doc(en_vocab, words=words), [HEAD, DEP])
    assert sent_start_flags(source) == sent_start_flags(reloaded)
    assert reloaded.has_annotation(" DEP ")
2020-02-16 19:17:09 +03:00
2020-07-14 15:07:35 +03:00
def test_doc_from_array_morph(en_vocab):
    """MORPH values exported with `to_array` must load back via `from_array`."""
    # fmt: off
    words = [" I ", " live ", " in ", " New ", " York ", " . "]
    morphs = [" Feat1=A ", " Feat1=B ", " Feat1=C ", " Feat1=A|Feat2=D ", " Feat2=E ", " Feat3=F "]
    # fmt: on

    def morph_strings(doc):
        return [str(token.morph) for token in doc]

    source = Doc(en_vocab, words=words, morphs=morphs)
    morph_values = source.to_array([MORPH])

    target = Doc(en_vocab, words=words)
    target.from_array([MORPH], morph_values)

    assert morph_strings(target) == morphs
    assert morph_strings(source) == morph_strings(target)
2020-07-14 15:07:35 +03:00
2021-03-30 10:49:12 +03:00
@pytest.mark.usefixtures(" clean_underscore ")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    """Exercise `Doc.from_docs` on a list of tokenized texts.

    Covered here: empty input, a single doc, docs from two different
    tokenizers, the default merge, `ensure_whitespace=False`, an explicit
    `attrs` list, the `exclude` option (spans, user_data, tensor), span
    groups with and without members, and tensors.
    """
    en_texts = [
        " Merging the docs is fun. ",
        " ",
        " They don ' t think alike. ",
        " ",
        " Another doc. ",
    ]
    en_texts_without_empty = [t for t in en_texts if len(t)]
    de_text = " Wie war die Frage? "
    en_docs = [en_tokenizer(text) for text in en_texts]
    # Put one span into the same span group on docs 0, 2 and 4, and remember
    # the span texts so they can be compared against the merged doc later.
    en_docs[0].spans[" group "] = [en_docs[0][1:4]]
    en_docs[2].spans[" group "] = [en_docs[2][1:4]]
    en_docs[4].spans[" group "] = [en_docs[4][0:1]]
    span_group_texts = sorted(
        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
    )
    de_doc = de_tokenizer(de_text)
    # Custom token extension, set to True on one token in docs 0 and 2.
    Token.set_extension(" is_ambiguous ", default=False)
    en_docs[0][2]._.is_ambiguous = True  # docs
    en_docs[2][3]._.is_ambiguous = True  # think
    # Empty input gives None; a single doc gives a new object with equal text.
    assert Doc.from_docs([]) is None
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))
    # Adding the doc from the other tokenizer must raise.
    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])
    # Default merge.
    m_doc = Doc.from_docs(en_docs)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    # Last token of the first doc.
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == " . " and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    # The extra 1 accounts for one delimiter character after the first text.
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index(" think ")
    assert m_doc[2]._.is_ambiguous is True
    assert m_doc[9].idx == think_idx
    assert m_doc[9]._.is_ambiguous is True
    assert not any([t._.is_ambiguous for t in m_doc[3:8]])
    assert " group " in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans[" group "]])
    assert bool(m_doc[11].whitespace_)
    # Merge with ensure_whitespace=False: no delimiter in the offset below.
    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(m_doc.text) == sum(len(t) for t in en_texts)
    assert m_doc.text == " ".join(en_texts_without_empty)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == " . " and not bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[2].index(" think ")
    assert m_doc[9].idx == think_idx
    assert " group " in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans[" group "]])
    assert bool(m_doc[11].whitespace_)
    # Merge with an explicit attribute list.
    m_doc = Doc.from_docs(en_docs, attrs=[" lemma ", " length ", " pos "])
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    # space delimiter considered, although spacy attribute was missing
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == " . " and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index(" think ")
    assert m_doc[9].idx == think_idx
    assert " group " in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans[" group "]])
    # can exclude spans
    m_doc = Doc.from_docs(en_docs, exclude=[" spans "])
    assert " group " not in m_doc.spans
    # can exclude user_data
    m_doc = Doc.from_docs(en_docs, exclude=[" user_data "])
    assert m_doc.user_data == {}
    # can merge empty docs
    doc = Doc.from_docs([en_tokenizer(" ")] * 10)
    # empty but set spans keys are preserved
    # The docs are re-tokenized first, so they start without any span group.
    en_docs = [en_tokenizer(text) for text in en_texts]
    m_doc = Doc.from_docs(en_docs)
    assert " group " not in m_doc.spans
    for doc in en_docs:
        doc.spans[" group "] = []
    m_doc = Doc.from_docs(en_docs)
    assert " group " in m_doc.spans
    assert len(m_doc.spans[" group "]) == 0
    # with tensor
    ops = get_current_ops()
    # One row per token: [length of the token text, 0.0].
    for doc in en_docs:
        doc.tensor = ops.asarray([[len(t.text), 0.0] for t in doc])
    m_doc = Doc.from_docs(en_docs)
    # Expected tensor: the tensors of all docs with at least one token, stacked.
    assert_array_equal(
        ops.to_numpy(m_doc.tensor),
        ops.to_numpy(ops.xp.vstack([doc.tensor for doc in en_docs if len(doc)])),
    )
    # can exclude tensor
    m_doc = Doc.from_docs(en_docs, exclude=[" tensor "])
    assert m_doc.tensor.shape == (0,)
2020-07-03 12:32:42 +03:00
2020-09-17 01:14:01 +03:00
def test_doc_api_from_docs_ents(en_tokenizer):
    """Check that Doc.from_docs keeps entities when only some docs have any.

    The first doc is given an explicitly empty entity tuple and the second a
    single entity; the merged doc is expected to contain exactly one entity.
    """
    texts = ["Merging the docs is fun.", "They don't think alike."]
    docs = [en_tokenizer(t) for t in texts]
    docs[0].ents = ()
    docs[1].ents = (Span(docs[1], 0, 1, label="foo"),)
    doc = Doc.from_docs(docs)
    assert len(doc.ents) == 1
2019-03-11 16:21:40 +03:00
def test_doc_lang(en_vocab):
    """Check the language attributes exposed on Doc and Token.

    Both the string form (`lang_`) and the ID form (`lang`) are compared
    against "en", first for a Doc built directly from the vocab and then for
    a Doc produced by an `English` pipeline.
    """
    # Doc constructed directly from a vocab
    doc = Doc(en_vocab, words=["Hello", "world"])
    assert doc.lang_ == "en"
    assert doc.lang == en_vocab.strings["en"]
    assert doc[0].lang_ == "en"
    assert doc[0].lang == en_vocab.strings["en"]
    # Doc produced by a language pipeline
    nlp = English()
    doc = nlp("Hello world")
    assert doc.lang_ == "en"
    assert doc.lang == en_vocab.strings["en"]
    assert doc[0].lang_ == "en"
    assert doc[0].lang == en_vocab.strings["en"]
2020-08-10 17:43:52 +03:00
def test_token_lexeme(en_vocab):
    """Test that tokens expose their lexeme."""
    token = Doc(en_vocab, words=["Hello", "world"])[0]
    assert isinstance(token.lex, Lexeme)
    assert token.lex.text == token.text
    # Looking the lexeme up in the vocab by orth ID must give an equal object
    assert en_vocab[token.orth] == token.lex
2020-09-17 01:14:01 +03:00
def test_has_annotation(en_vocab):
    """Check Doc.has_annotation for token-level attributes in three states.

    The same attribute list is checked when nothing is annotated, when only
    the first of two tokens is annotated (partial), and when both tokens are
    annotated (complete).
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    # Nothing annotated yet
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    # Annotate the first token only: present, but not complete
    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
    doc[0].set_morph("Feat=Val")
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    # Annotate the second token as well: present and complete
    doc[1].tag_ = "A"
    doc[1].pos_ = "X"
    doc[1].set_morph("")
    doc[1].lemma_ = "a"
    doc[1].dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)
2022-02-08 10:35:37 +03:00
def test_has_annotation_sents(en_vocab):
    """Check Doc.has_annotation for the sentence-boundary attributes.

    Sentence-start values are set one token at a time on a three-token doc,
    and the plain and `require_complete=True` results are checked after each
    step.
    """
    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    # The first token (index 0) is always assumed to be a sentence start,
    # and ignored by the check in doc.has_annotation
    doc[1].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    # Once the last remaining token is set, the annotation is complete
    doc[2].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)
2020-09-17 01:14:01 +03:00
def test_is_flags_deprecated(en_tokenizer):
    """Check that reading each legacy `Doc.is_*` flag emits a deprecation warning."""
    doc = en_tokenizer("test")
    # The attribute access itself is the behavior under test, so the bare
    # expression statements are intentional.
    with pytest.deprecated_call():
        doc.is_tagged
    with pytest.deprecated_call():
        doc.is_parsed
    with pytest.deprecated_call():
        doc.is_nered
    with pytest.deprecated_call():
        doc.is_sentenced
2020-09-17 22:10:41 +03:00
2020-09-22 14:45:50 +03:00
def test_doc_set_ents(en_tokenizer):
    """Exercise Doc.set_ents with its span categories and default modes.

    Covers plain entities, `blocked`, `missing` and `outside` spans, the
    "unmodified" and "missing" default modes, and the invalid calls that must
    raise ValueError. The expected `ent_iob` lists use spaCy's integer codes
    (per the Token API docs: 3 begins an entity, 1 is inside, 2 is outside,
    0 means no tag is set).
    """
    # set ents
    doc = en_tokenizer("a b c d e")
    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2]
    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]

    # add ents, invalid IOB repaired
    doc = en_tokenizer("a b c d e")
    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
    doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
    assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2]
    assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0]

    # missing ents
    doc = en_tokenizer("a b c d e")
    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]])
    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0]
    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]

    # outside ents
    doc = en_tokenizer("a b c d e")
    doc.set_ents(
        [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)],
        outside=[doc[4:5]],
        default="missing",
    )
    assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2]
    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]

    # blocked ents
    doc = en_tokenizer("a b c d e")
    doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
    assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
    assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
    assert doc.ents == tuple()

    # invalid IOB repaired after blocked
    doc.ents = [Span(doc, 3, 5, "ENT")]
    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
    doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]

    # all types
    doc = en_tokenizer("a b c d e")
    doc.set_ents(
        [Span(doc, 0, 1, 10)],
        blocked=[doc[1:2]],
        missing=[doc[2:3]],
        outside=[doc[3:4]],
        default="unmodified",
    )
    assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0]
    assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0]

    doc = en_tokenizer("a b c d e")
    # single span instead of a list
    with pytest.raises(ValueError):
        doc.set_ents([], missing=doc[1:2])
    # invalid default mode
    with pytest.raises(ValueError):
        doc.set_ents([], missing=[doc[1:2]], default="none")
    # conflicting/overlapping specifications
    with pytest.raises(ValueError):
        doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
2020-09-22 14:45:50 +03:00
def test_doc_ents_setter():
    """Test that both strings and integers can be used to set entities in
    tuple format via doc.ents."""
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    # One label is given as a string, the other as the ID returned by the
    # string store
    doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
    # The same entities, passed to the constructor as IOB tags. (A tuple-format
    # list that was assigned here and immediately overwritten has been dropped
    # as dead code.)
    vocab = Vocab()
    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
    doc = Doc(vocab, words=words, ents=ents)
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
2020-10-01 17:22:18 +03:00
2020-10-01 23:21:46 +03:00
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
    """Check when a Token.morph value may be assigned from another token.

    Values are copied within one doc and between two docs from the English
    tokenizer; assigning a value taken from a doc made by the German tokenizer
    must raise ValueError.
    """
    doc1 = en_tokenizer("a b")
    doc1b = en_tokenizer("c d")
    doc2 = de_tokenizer("a b")

    # unset values can be copied
    doc1[0].morph = doc1[1].morph
    assert doc1[0].morph.key == 0
    assert doc1[1].morph.key == 0

    # morph values from the same vocab can be copied
    doc1[0].set_morph("Feat=Val")
    doc1[1].morph = doc1[0].morph
    assert doc1[0].morph == doc1[1].morph

    # ... also across docs
    doc1b[0].morph = doc1[0].morph
    assert doc1[0].morph == doc1b[0].morph

    doc2[0].set_morph("Feat2=Val2")

    # the morph value must come from the same vocab
    with pytest.raises(ValueError):
        doc1[0].morph = doc2[0].morph
2020-10-01 17:22:18 +03:00
def test_doc_init_iob():
    """Test ents validation/normalization in Doc.__init__"""
    words = ["a", "b", "c", "d", "e"]
    ents = ["O"] * len(words)
    doc = Doc(Vocab(), words=words, ents=ents)
    assert doc.ents == ()

    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
    doc = Doc(Vocab(), words=words, ents=ents)
    assert len(doc.ents) == 2

    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
    doc = Doc(Vocab(), words=words, ents=ents)
    assert len(doc.ents) == 3

    # None is missing
    ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
    doc = Doc(Vocab(), words=words, ents=ents)
    assert len(doc.ents) == 2

    # empty tag is missing
    ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
    doc = Doc(Vocab(), words=words, ents=ents)
    assert len(doc.ents) == 2

    # The constructor calls below are expected to raise, so their results
    # are not bound to a name.
    # invalid IOB
    ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
    with pytest.raises(ValueError):
        Doc(Vocab(), words=words, ents=ents)

    # no dash
    ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
    with pytest.raises(ValueError):
        Doc(Vocab(), words=words, ents=ents)

    # no ent type
    ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
    with pytest.raises(ValueError):
        Doc(Vocab(), words=words, ents=ents)

    # not strings or None
    ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
    with pytest.raises(ValueError):
        Doc(Vocab(), words=words, ents=ents)
2020-10-09 13:10:25 +03:00
2020-10-09 15:42:51 +03:00
def test_doc_set_ents_invalid_spans(en_tokenizer):
    """Check that assigning spans created before retokenization raises.

    The spans are built against the original tokenization and then merged via
    the retokenizer; assigning those same span objects to `doc.ents` afterwards
    is expected to raise IndexError.
    """
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    with pytest.raises(IndexError):
        doc.ents = spans
2021-01-14 09:30:41 +03:00
2021-01-17 14:56:05 +03:00
def test_doc_noun_chunks_not_implemented():
    """Test that a language without noun_chunk iterator, throws a NotImplementedError"""
    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
    nlp = MultiLanguage()
    doc = nlp(text)
    with pytest.raises(NotImplementedError):
        # list() forces the noun_chunks iterator to be consumed
        _ = list(doc.noun_chunks)  # noqa: F841
2021-01-17 14:56:05 +03:00
2021-01-14 09:30:41 +03:00
def test_span_groups(en_tokenizer):
    """Exercise the basic `doc.spans` workflow for a single key.

    Covers assignment from a list, membership by key, `append`, `extend`,
    the `has_overlap` flag, and deletion of the key.
    """
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
    # Membership is by group key, not by span label
    assert "hi" in doc.spans
    assert "bye" not in doc.spans
    assert len(doc.spans["hi"]) == 1
    assert doc.spans["hi"][0].label_ == "bye"

    doc.spans["hi"].append(doc[0:3])
    assert len(doc.spans["hi"]) == 2
    assert doc.spans["hi"][1].text == "Some text about"
    assert [span.text for span in doc.spans["hi"]] == ["Colombia", "Some text about"]
    assert not doc.spans["hi"].has_overlap

    # Extending with doc.ents adds a second span over tokens 3-4, so the
    # group now overlaps
    doc.ents = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
    doc.spans["hi"].extend(doc.ents)
    assert len(doc.spans["hi"]) == 4
    assert [span.label_ for span in doc.spans["hi"]] == ["bye", "", "GPE", "GPE"]
    assert doc.spans["hi"].has_overlap

    del doc.spans["hi"]
    assert "hi" not in doc.spans
2021-02-28 04:32:48 +03:00
def test_doc_spans_copy(en_tokenizer):
    """Check that `doc.spans.doc_ref` refers to the owning doc, also after copy()."""
    doc1 = en_tokenizer("Some text about Colombia and the Czech Republic")
    assert weakref.ref(doc1) == doc1.spans.doc_ref
    # The copy's span container must reference the copy, not the source doc
    doc2 = doc1.copy()
    assert weakref.ref(doc2) == doc2.spans.doc_ref
2022-05-12 11:06:25 +03:00
def test_doc_spans_setdefault(en_tokenizer):
    """Check `doc.spans.setdefault` with no default, a list, and a SpanGroup."""
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    # No default: the key is created with an empty group
    doc.spans.setdefault("key1")
    assert len(doc.spans["key1"]) == 0
    # Default given as a plain list of spans
    doc.spans.setdefault("key2", default=[doc[0:1]])
    assert len(doc.spans["key2"]) == 1
    # Default given as a ready-made SpanGroup
    doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
    assert len(doc.spans["key3"]) == 2