Make get_sentence_map work with init

When sentences are not available, just treat the whole doc as one
sentence. This is a reasonable general fallback, and it is especially
important for the init call, where upstream components aren't run.
This commit is contained in:
Paul O'Leary McCann 2021-05-18 19:54:54 +09:00
parent 883c137b26
commit a7d9c8156d
2 changed files with 15 additions and 9 deletions

View File

@ -145,6 +145,7 @@ def span_embeddings_forward(
tokvecs, docs = inputs tokvecs, docs = inputs
#TODO fix this
dim = tokvecs[0].shape[1] dim = tokvecs[0].shape[1]
get_mentions = model.attrs["get_mentions"] get_mentions = model.attrs["get_mentions"]

View File

@ -1,6 +1,6 @@
from thinc.types import Ints2d from thinc.types import Ints2d
from spacy.tokens import Doc from spacy.tokens import Doc
from typing import List, Tuple, Callable from typing import List, Tuple, Callable, Any
from ...util import registry from ...util import registry
# type alias to make writing this less tedious # type alias to make writing this less tedious
@ -109,13 +109,18 @@ def get_predicted_clusters(
def get_sentence_map(doc: Doc): def get_sentence_map(doc: Doc):
"""For the given span, return a list of sentence indexes.""" """For the given span, return a list of sentence indexes."""
si = 0 try:
out = [] si = 0
for sent in doc.sents: out = []
for tok in sent: for sent in doc.sents:
out.append(si) for tok in sent:
si += 1 out.append(si)
return out si += 1
return out
except ValueError:
# If there are no sents then just return dummy values.
# Shouldn't happen in general training, but typical in init.
return [0] * len(doc)
def get_candidate_mentions( def get_candidate_mentions(
@ -144,7 +149,7 @@ def get_candidate_mentions(
@registry.misc("spacy.CorefCandidateGenerator.v0") @registry.misc("spacy.CorefCandidateGenerator.v0")
def create_mention_generator() -> Callable: def create_mention_generator() -> Any:
return get_candidate_mentions return get_candidate_mentions