spaCy/spacy/pipeline/sentencizer.pyx
Adriane Boyd f99d6d5e39
Refactor scoring methods to use registered functions (#8766)
* Add scorer option to components

Add an optional `scorer` parameter to all pipeline components. If a
scoring function is provided, it overrides the default scoring method
for that component.

* Add registered scorers for all components

* Add `scorers` registry
* Move all scoring methods outside of components as independent
  functions and register
* Use the registered scoring methods as defaults in configs and inits

Additional:

* The scoring methods no longer have access to the full component, so
  use settings from `cfg` as default scorer options to handle settings
  such as `labels`, `threshold`, and `positive_label`
* The `attribute_ruler` scoring method no longer has access to the
  patterns, so all scoring methods are called
* Bug fix: `spancat` scoring method is updated to set `allow_overlap` to
  score overlapping spans correctly

* Update Russian lemmatizer to use direct score method

* Check type of cfg in Pipe.score

* Fix check

* Update spacy/pipeline/sentencizer.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Remove validate_examples from scoring functions

* Use Pipe.labels instead of Pipe.cfg["labels"]

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2021-08-10 15:13:39 +02:00

176 lines
6.3 KiB
Cython
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# cython: infer_types=True, profile=True, binding=True
from typing import Optional, List, Callable
import srsly
from ..tokens.doc cimport Doc
from .pipe import Pipe
from .senter import senter_score
from ..language import Language
from ..scorer import Scorer
from .. import util
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(
nlp: Language,
name: str,
punct_chars: Optional[List[str]],
scorer: Optional[Callable],
):
return Sentencizer(name, punct_chars=punct_chars, scorer=scorer)
class Sentencizer(Pipe):
"""Segment the Doc into sentences using a rule-based strategy.
DOCS: https://spacy.io/api/sentencizer
"""
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '᱿',
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
'𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'', '']
def __init__(
self,
name="sentencizer",
*,
punct_chars=None,
scorer=senter_score,
):
"""Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object.
RETURNS (Sentencizer): The sentencizer component.
DOCS: https://spacy.io/api/sentencizer#init
"""
self.name = name
if punct_chars:
self.punct_chars = set(punct_chars)
else:
self.punct_chars = set(self.default_punct_chars)
self.scorer = scorer
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/sentencizer#call
"""
error_handler = self.get_error_handler()
try:
tags = self.predict([doc])
self.set_annotations([doc], tags)
return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def predict(self, docs):
"""Apply the pipe to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
RETURNS: The predictions for each document.
"""
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
guesses = [[] for doc in docs]
return guesses
guesses = []
for doc in docs:
doc_guesses = [False] * len(doc)
if len(doc) > 0:
start = 0
seen_period = False
doc_guesses[0] = True
for i, token in enumerate(doc):
is_in_punct_chars = token.text in self.punct_chars
if seen_period and not token.is_punct and not is_in_punct_chars:
doc_guesses[start] = True
start = token.i
seen_period = False
elif is_in_punct_chars:
seen_period = True
if start < len(doc):
doc_guesses[start] = True
guesses.append(doc_guesses)
return guesses
def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
scores: The tag IDs produced by Sentencizer.predict.
"""
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef int idx = 0
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber existing sentence boundaries
if doc.c[j].sent_start == 0:
if tag_id:
doc.c[j].sent_start = 1
else:
doc.c[j].sent_start = -1
def to_bytes(self, *, exclude=tuple()):
"""Serialize the sentencizer to a bytestring.
RETURNS (bytes): The serialized object.
DOCS: https://spacy.io/api/sentencizer#to_bytes
"""
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the sentencizer from a bytestring.
bytes_data (bytes): The data to load.
returns (Sentencizer): The loaded object.
DOCS: https://spacy.io/api/sentencizer#from_bytes
"""
cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
return self
def to_disk(self, path, *, exclude=tuple()):
"""Serialize the sentencizer to disk.
DOCS: https://spacy.io/api/sentencizer#to_disk
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
def from_disk(self, path, *, exclude=tuple()):
"""Load the sentencizer from disk.
DOCS: https://spacy.io/api/sentencizer#from_disk
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
cfg = srsly.read_json(path)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
return self