mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Merge branch 'master' into develop
This commit is contained in:
		
						commit
						16c2522791
					
				| 
						 | 
					@ -11,6 +11,12 @@ _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_hindi = r"\u0900-\u097F"
 | 
					_hindi = r"\u0900-\u097F"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_kannada = r"\u0C80-\u0CFF"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_tamil = r"\u0B80-\u0BFF"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_telugu = r"\u0C00-\u0C7F"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Latin standard
 | 
					# Latin standard
 | 
				
			||||||
_latin_u_standard = r"A-Z"
 | 
					_latin_u_standard = r"A-Z"
 | 
				
			||||||
_latin_l_standard = r"a-z"
 | 
					_latin_l_standard = r"a-z"
 | 
				
			||||||
| 
						 | 
					@ -195,7 +201,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
 | 
				
			||||||
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
 | 
					_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
 | 
				
			||||||
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
 | 
					_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi
 | 
					_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
 | 
					ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
 | 
				
			||||||
ALPHA_LOWER = group_chars(_lower + _uncased)
 | 
					ALPHA_LOWER = group_chars(_lower + _uncased)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1375,7 +1375,16 @@ class Sentencizer(object):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = "sentencizer"
 | 
					    name = "sentencizer"
 | 
				
			||||||
    default_punct_chars = [".", "!", "?"]
 | 
					    default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
 | 
				
			||||||
 | 
					            '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
 | 
				
			||||||
 | 
					            '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
 | 
				
			||||||
 | 
					            '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
 | 
				
			||||||
 | 
					            '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
 | 
				
			||||||
 | 
					            '﹖', '﹗', '！', '．', '？', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
 | 
				
			||||||
 | 
					            '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
 | 
				
			||||||
 | 
					            '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
 | 
				
			||||||
 | 
					            '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
 | 
				
			||||||
 | 
					            '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, punct_chars=None, **kwargs):
 | 
					    def __init__(self, punct_chars=None, **kwargs):
 | 
				
			||||||
        """Initialize the sentencizer.
 | 
					        """Initialize the sentencizer.
 | 
				
			||||||
| 
						 | 
					@ -1386,7 +1395,10 @@ class Sentencizer(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/sentencizer#init
 | 
					        DOCS: https://spacy.io/api/sentencizer#init
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        self.punct_chars = punct_chars or self.default_punct_chars
 | 
					        if punct_chars:
 | 
				
			||||||
 | 
					            self.punct_chars = set(punct_chars)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            self.punct_chars = set(self.default_punct_chars)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, doc):
 | 
					    def __call__(self, doc):
 | 
				
			||||||
        """Apply the sentencizer to a Doc and set Token.is_sent_start.
 | 
					        """Apply the sentencizer to a Doc and set Token.is_sent_start.
 | 
				
			||||||
| 
						 | 
					@ -1418,7 +1430,7 @@ class Sentencizer(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/sentencizer#to_bytes
 | 
					        DOCS: https://spacy.io/api/sentencizer#to_bytes
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        return srsly.msgpack_dumps({"punct_chars": self.punct_chars})
 | 
					        return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_bytes(self, bytes_data, **kwargs):
 | 
					    def from_bytes(self, bytes_data, **kwargs):
 | 
				
			||||||
        """Load the sentencizer from a bytestring.
 | 
					        """Load the sentencizer from a bytestring.
 | 
				
			||||||
| 
						 | 
					@ -1429,7 +1441,7 @@ class Sentencizer(object):
 | 
				
			||||||
        DOCS: https://spacy.io/api/sentencizer#from_bytes
 | 
					        DOCS: https://spacy.io/api/sentencizer#from_bytes
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        cfg = srsly.msgpack_loads(bytes_data)
 | 
					        cfg = srsly.msgpack_loads(bytes_data)
 | 
				
			||||||
        self.punct_chars = cfg.get("punct_chars", self.default_punct_chars)
 | 
					        self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def to_disk(self, path, exclude=tuple(), **kwargs):
 | 
					    def to_disk(self, path, exclude=tuple(), **kwargs):
 | 
				
			||||||
| 
						 | 
					@ -1439,7 +1451,7 @@ class Sentencizer(object):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        path = util.ensure_path(path)
 | 
					        path = util.ensure_path(path)
 | 
				
			||||||
        path = path.with_suffix(".json")
 | 
					        path = path.with_suffix(".json")
 | 
				
			||||||
        srsly.write_json(path, {"punct_chars": self.punct_chars})
 | 
					        srsly.write_json(path, {"punct_chars": list(self.punct_chars)})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_disk(self, path, exclude=tuple(), **kwargs):
 | 
					    def from_disk(self, path, exclude=tuple(), **kwargs):
 | 
				
			||||||
| 
						 | 
					@ -1450,7 +1462,7 @@ class Sentencizer(object):
 | 
				
			||||||
        path = util.ensure_path(path)
 | 
					        path = util.ensure_path(path)
 | 
				
			||||||
        path = path.with_suffix(".json")
 | 
					        path = path.with_suffix(".json")
 | 
				
			||||||
        cfg = srsly.read_json(path)
 | 
					        cfg = srsly.read_json(path)
 | 
				
			||||||
        self.punct_chars = cfg.get("punct_chars", self.default_punct_chars)
 | 
					        self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -81,7 +81,7 @@ def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_s
 | 
				
			||||||
def test_sentencizer_serialize_bytes(en_vocab):
 | 
					def test_sentencizer_serialize_bytes(en_vocab):
 | 
				
			||||||
    punct_chars = [".", "~", "+"]
 | 
					    punct_chars = [".", "~", "+"]
 | 
				
			||||||
    sentencizer = Sentencizer(punct_chars=punct_chars)
 | 
					    sentencizer = Sentencizer(punct_chars=punct_chars)
 | 
				
			||||||
    assert sentencizer.punct_chars == punct_chars
 | 
					    assert sentencizer.punct_chars == set(punct_chars)
 | 
				
			||||||
    bytes_data = sentencizer.to_bytes()
 | 
					    bytes_data = sentencizer.to_bytes()
 | 
				
			||||||
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
 | 
					    new_sentencizer = Sentencizer().from_bytes(bytes_data)
 | 
				
			||||||
    assert new_sentencizer.punct_chars == punct_chars
 | 
					    assert new_sentencizer.punct_chars == set(punct_chars)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -107,7 +107,7 @@ process.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Infobox>
 | 
					<Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
**Usage:** [Models directory](/models) [Benchmarks](#benchmarks)
 | 
					**Usage:** [Models directory](/models)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
</Infobox>
 | 
					</Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -10,10 +10,7 @@
 | 
				
			||||||
    "modelsRepo": "explosion/spacy-models",
 | 
					    "modelsRepo": "explosion/spacy-models",
 | 
				
			||||||
    "social": {
 | 
					    "social": {
 | 
				
			||||||
        "twitter": "spacy_io",
 | 
					        "twitter": "spacy_io",
 | 
				
			||||||
        "github": "explosion",
 | 
					        "github": "explosion"
 | 
				
			||||||
        "reddit": "spacynlp",
 | 
					 | 
				
			||||||
        "codepen": "explosion",
 | 
					 | 
				
			||||||
        "gitter": "explosion/spaCy"
 | 
					 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    "theme": "#09a3d5",
 | 
					    "theme": "#09a3d5",
 | 
				
			||||||
    "analytics": "UA-58931649-1",
 | 
					    "analytics": "UA-58931649-1",
 | 
				
			||||||
| 
						 | 
					@ -69,6 +66,7 @@
 | 
				
			||||||
            "items": [
 | 
					            "items": [
 | 
				
			||||||
                { "text": "Twitter", "url": "https://twitter.com/spacy_io" },
 | 
					                { "text": "Twitter", "url": "https://twitter.com/spacy_io" },
 | 
				
			||||||
                { "text": "GitHub", "url": "https://github.com/explosion/spaCy" },
 | 
					                { "text": "GitHub", "url": "https://github.com/explosion/spaCy" },
 | 
				
			||||||
 | 
					                { "text": "YouTube", "url": "https://youtube.com/c/ExplosionAI" },
 | 
				
			||||||
                { "text": "Blog", "url": "https://explosion.ai/blog" }
 | 
					                { "text": "Blog", "url": "https://explosion.ai/blog" }
 | 
				
			||||||
            ]
 | 
					            ]
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user