Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 18:26:30 +03:00
Extend default punct for sentencizer (#4290)
Most of these characters are for languages / writing systems that aren't supported by spaCy, but I don't think it causes problems to include them. In the UD evals, Hindi and Urdu improve a lot as expected (from 0-10% to 70-80%) and Persian improves a little (90% to 96%). Tamil improves in combination with #4288.

The punctuation list is converted to a set internally because of its increased length.

The sentence-final punctuation was generated with:

```
unichars -gas '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' '\p{Terminal_Punctuation}'
```

See: https://stackoverflow.com/a/9508766/461847

Fixes #4269.
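As a quick illustration of the effect (a minimal sketch assuming the spaCy v2.x `create_pipe`/`add_pipe` API; the sample text is made up), the Devanagari danda '।' now ends a sentence with no custom configuration:

```python
# Minimal sketch, spaCy v2.x API assumed. With the extended defaults,
# the Devanagari danda '।' is a sentence boundary out of the box,
# which is what drives the Hindi/Urdu improvements in the UD evals.
from spacy.lang.hi import Hindi

nlp = Hindi()
nlp.add_pipe(nlp.create_pipe("sentencizer"))

# "This is the first sentence. This is the second sentence."
doc = nlp("यह पहला वाक्य है। यह दूसरा वाक्य है।")
print([sent.text for sent in doc.sents])
# expected: ['यह पहला वाक्य है।', 'यह दूसरा वाक्य है।']
```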
This commit is contained in:
parent bee7961927, commit 6942a6a69b
```diff
@@ -1371,7 +1371,16 @@ class Sentencizer(object):
     """
 
     name = "sentencizer"
-    default_punct_chars = [".", "!", "?"]
+    default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
+            '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
+            '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
+            '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
+            '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
+            '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
+            '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
+            '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
+            '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
+            '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈']
 
     def __init__(self, punct_chars=None, **kwargs):
         """Initialize the sentencizer.
@@ -1382,7 +1391,10 @@ class Sentencizer(object):
 
         DOCS: https://spacy.io/api/sentencizer#init
         """
-        self.punct_chars = punct_chars or self.default_punct_chars
+        if punct_chars:
+            self.punct_chars = set(punct_chars)
+        else:
+            self.punct_chars = set(self.default_punct_chars)
 
     def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
```
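For context on the set conversion above: the sentencizer performs a membership test for every token, and with ~130 default characters a set keeps that check O(1) instead of O(n). An illustrative micro-benchmark sketch (the stand-in list and timings are hypothetical and will vary by machine):

```python
# Illustrative only: compares list vs. set membership for a punctuation
# check like the one the sentencizer runs per token.
import timeit

punct_list = ['!', '.', '?', '।', '॥', '؟', '۔'] * 19  # stand-in, ~130 items
punct_set = set(punct_list)

# 'a' is not punctuation, so the list probe scans every item (worst case);
# the set probe is a single hash lookup.
print(timeit.timeit("'a' in punct_list", globals=globals(), number=1_000_000))
print(timeit.timeit("'a' in punct_set", globals=globals(), number=1_000_000))
```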
```diff
@@ -1414,7 +1426,7 @@ class Sentencizer(object):
 
         DOCS: https://spacy.io/api/sentencizer#to_bytes
         """
-        return srsly.msgpack_dumps({"punct_chars": self.punct_chars})
+        return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
 
     def from_bytes(self, bytes_data, **kwargs):
         """Load the sentencizer from a bytestring.
@@ -1425,7 +1437,7 @@ class Sentencizer(object):
         DOCS: https://spacy.io/api/sentencizer#from_bytes
         """
         cfg = srsly.msgpack_loads(bytes_data)
-        self.punct_chars = cfg.get("punct_chars", self.default_punct_chars)
+        self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
         return self
 
     def to_disk(self, path, exclude=tuple(), **kwargs):
@@ -1435,7 +1447,7 @@ class Sentencizer(object):
         """
         path = util.ensure_path(path)
         path = path.with_suffix(".json")
-        srsly.write_json(path, {"punct_chars": self.punct_chars})
+        srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
 
 
     def from_disk(self, path, exclude=tuple(), **kwargs):
@@ -1446,7 +1458,7 @@ class Sentencizer(object):
         path = util.ensure_path(path)
         path = path.with_suffix(".json")
         cfg = srsly.read_json(path)
-        self.punct_chars = cfg.get("punct_chars", self.default_punct_chars)
+        self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
         return self
 
 
```
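The `list()` calls in `to_bytes`/`to_disk` are needed because neither msgpack nor JSON has a set type, so the set is serialized as a list and rebuilt on load. A sketch of the disk round trip (the path below is hypothetical):

```python
# Sketch of the round trip: punct_chars is written out as a JSON list
# (to_disk appends a .json suffix) and rebuilt as a set on load.
from spacy.pipeline import Sentencizer

sentencizer = Sentencizer(punct_chars=[".", "!", "?", "।"])
sentencizer.to_disk("/tmp/sentencizer")  # saved as /tmp/sentencizer.json
restored = Sentencizer().from_disk("/tmp/sentencizer")
assert restored.punct_chars == {".", "!", "?", "।"}
```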
```diff
@@ -81,7 +81,7 @@ def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_s
 def test_sentencizer_serialize_bytes(en_vocab):
     punct_chars = [".", "~", "+"]
     sentencizer = Sentencizer(punct_chars=punct_chars)
-    assert sentencizer.punct_chars == punct_chars
+    assert sentencizer.punct_chars == set(punct_chars)
     bytes_data = sentencizer.to_bytes()
     new_sentencizer = Sentencizer().from_bytes(bytes_data)
-    assert new_sentencizer.punct_chars == punct_chars
+    assert new_sentencizer.punct_chars == set(punct_chars)
```