Merge branch 'master' into develop

This commit is contained in:
Ines Montani 2019-09-14 16:42:01 +02:00
commit 16c2522791
5 changed files with 30 additions and 14 deletions

View File

@@ -11,6 +11,12 @@ _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
_hindi = r"\u0900-\u097F" _hindi = r"\u0900-\u097F"
_kannada = r"\u0C80-\u0CFF"
_tamil = r"\u0B80-\u0BFF"
_telugu = r"\u0C00-\u0C7F"
# Latin standard # Latin standard
_latin_u_standard = r"A-Z" _latin_u_standard = r"A-Z"
_latin_l_standard = r"a-z" _latin_l_standard = r"a-z"
@@ -195,7 +201,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi _uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased)

View File

@@ -1375,7 +1375,16 @@ class Sentencizer(object):
""" """
name = "sentencizer" name = "sentencizer"
default_punct_chars = [".", "!", "?"] default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '᱿',
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
'𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈']
def __init__(self, punct_chars=None, **kwargs): def __init__(self, punct_chars=None, **kwargs):
"""Initialize the sentencizer. """Initialize the sentencizer.
@@ -1386,7 +1395,10 @@ class Sentencizer(object):
DOCS: https://spacy.io/api/sentencizer#init DOCS: https://spacy.io/api/sentencizer#init
""" """
self.punct_chars = punct_chars or self.default_punct_chars if punct_chars:
self.punct_chars = set(punct_chars)
else:
self.punct_chars = set(self.default_punct_chars)
def __call__(self, doc): def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start. """Apply the sentencizer to a Doc and set Token.is_sent_start.
@@ -1418,7 +1430,7 @@ class Sentencizer(object):
DOCS: https://spacy.io/api/sentencizer#to_bytes DOCS: https://spacy.io/api/sentencizer#to_bytes
""" """
return srsly.msgpack_dumps({"punct_chars": self.punct_chars}) return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
def from_bytes(self, bytes_data, **kwargs): def from_bytes(self, bytes_data, **kwargs):
"""Load the sentencizer from a bytestring. """Load the sentencizer from a bytestring.
@@ -1429,7 +1441,7 @@ class Sentencizer(object):
DOCS: https://spacy.io/api/sentencizer#from_bytes DOCS: https://spacy.io/api/sentencizer#from_bytes
""" """
cfg = srsly.msgpack_loads(bytes_data) cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
return self return self
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple(), **kwargs):
@@ -1439,7 +1451,7 @@ class Sentencizer(object):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
path = path.with_suffix(".json") path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": self.punct_chars}) srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
def from_disk(self, path, exclude=tuple(), **kwargs): def from_disk(self, path, exclude=tuple(), **kwargs):
@@ -1450,7 +1462,7 @@ class Sentencizer(object):
path = util.ensure_path(path) path = util.ensure_path(path)
path = path.with_suffix(".json") path = path.with_suffix(".json")
cfg = srsly.read_json(path) cfg = srsly.read_json(path)
self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
return self return self

View File

@@ -81,7 +81,7 @@ def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_s
def test_sentencizer_serialize_bytes(en_vocab): def test_sentencizer_serialize_bytes(en_vocab):
punct_chars = [".", "~", "+"] punct_chars = [".", "~", "+"]
sentencizer = Sentencizer(punct_chars=punct_chars) sentencizer = Sentencizer(punct_chars=punct_chars)
assert sentencizer.punct_chars == punct_chars assert sentencizer.punct_chars == set(punct_chars)
bytes_data = sentencizer.to_bytes() bytes_data = sentencizer.to_bytes()
new_sentencizer = Sentencizer().from_bytes(bytes_data) new_sentencizer = Sentencizer().from_bytes(bytes_data)
assert new_sentencizer.punct_chars == punct_chars assert new_sentencizer.punct_chars == set(punct_chars)

View File

@@ -107,7 +107,7 @@ process.
<Infobox> <Infobox>
**Usage:** [Models directory](/models) [Benchmarks](#benchmarks) **Usage:** [Models directory](/models)
</Infobox> </Infobox>

View File

@@ -10,10 +10,7 @@
"modelsRepo": "explosion/spacy-models", "modelsRepo": "explosion/spacy-models",
"social": { "social": {
"twitter": "spacy_io", "twitter": "spacy_io",
"github": "explosion", "github": "explosion"
"reddit": "spacynlp",
"codepen": "explosion",
"gitter": "explosion/spaCy"
}, },
"theme": "#09a3d5", "theme": "#09a3d5",
"analytics": "UA-58931649-1", "analytics": "UA-58931649-1",
@@ -69,6 +66,7 @@
"items": [ "items": [
{ "text": "Twitter", "url": "https://twitter.com/spacy_io" }, { "text": "Twitter", "url": "https://twitter.com/spacy_io" },
{ "text": "GitHub", "url": "https://github.com/explosion/spaCy" }, { "text": "GitHub", "url": "https://github.com/explosion/spaCy" },
{ "text": "YouTube", "url": "https://youtube.com/c/ExplosionAI" },
{ "text": "Blog", "url": "https://explosion.ai/blog" } { "text": "Blog", "url": "https://explosion.ai/blog" }
] ]
} }