Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Merge pull request #5798 from explosion/feature/language-data-config
Commit: cdbd6ba912
@@ -1,4 +1,5 @@
 [training]
+max_steps = 0
 patience = 10000
 eval_frequency = 200
 dropout = 0.2
@@ -8,13 +9,20 @@ max_epochs = 100
 orth_variant_level = 0.0
 gold_preproc = true
 max_length = 0
-use_gpu = -1
 scores = ["tags_acc", "uas", "las"]
 score_weights = {"las": 0.8, "tags_acc": 0.2}
 limit = 0
 seed = 0
 accumulate_gradient = 2
 discard_oversize = false
+raw_text = null
+tag_map = null
+morph_rules = null
+base_model = null
+
+eval_batch_size = 128
+use_pytorch_for_gpu_memory = false
+batch_by = "padded"

 [training.batch_size]
 @schedules = "compounding.v1"
@@ -30,41 +38,48 @@ beta2 = 0.999

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+pipeline = ["tok2vec", "tagger", "parser"]
+load_vocab_data = false

-[nlp.pipeline.tok2vec]
+[nlp.tokenizer]
+@tokenizers = "spacy.Tokenizer.v1"
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[components]
+
+[components.tok2vec]
 factory = "tok2vec"

-[nlp.pipeline.tagger]
+[components.tagger]
 factory = "tagger"

-[nlp.pipeline.parser]
+[components.parser]
 factory = "parser"
 learn_tokens = false
 min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0

-[nlp.pipeline.tagger.model]
+[components.tagger.model]
 @architectures = "spacy.Tagger.v1"

-[nlp.pipeline.tagger.model.tok2vec]
+[components.tagger.model.tok2vec]
 @architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
+width = ${components.tok2vec.model:width}

-[nlp.pipeline.parser.model]
+[components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
 nr_feature_tokens = 8
 hidden_width = 64
 maxout_pieces = 3

-[nlp.pipeline.parser.model.tok2vec]
+[components.parser.model.tok2vec]
 @architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
+width = ${components.tok2vec.model:width}

-[nlp.pipeline.tok2vec.model]
+[components.tok2vec.model]
 @architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = ${nlp:vectors}
+pretrained_vectors = ${training:vectors}
 width = 96
 depth = 4
 window_size = 1
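Not part of the diff, just a minimal sketch of how a config in this new shape is read, using Thinc's Config class the way the surrounding code does; the file name is a placeholder:

# Illustrative only: parse a config with pipeline components under [components]
from thinc.api import Config

with open("example.cfg") as f:          # placeholder path
    config = Config().from_str(f.read())
print(config["components"]["parser"]["factory"])  # -> "parser"
print(config["training"]["batch_by"])             # -> "padded"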
@@ -104,7 +104,6 @@ exclude =
     .git,
     __pycache__,
     _tokenizer_exceptions_list.py,
-    spacy/__init__.py

 [tool:pytest]
 markers =
@@ -1,32 +1,50 @@
+from typing import Union, Iterable, Dict, Any
+from pathlib import Path
 import warnings
 import sys

-warnings.filterwarnings("ignore", message="numpy.dtype size changed")
-warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
+warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
+warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu
+from thinc.api import prefer_gpu, require_gpu  # noqa: F401

-from . import pipeline
-from .cli.info import info
-from .glossary import explain
-from .about import __version__
-from .errors import Errors, Warnings
+from . import pipeline  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .glossary import explain  # noqa: F401
+from .about import __version__  # noqa: F401
+from .util import registry  # noqa: F401
+
+from .errors import Errors
+from .language import Language
 from . import util
-from .util import registry
-

 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)


-config = registry
+def load(
+    name: Union[str, Path],
+    disable: Iterable[str] = tuple(),
+    component_cfg: Dict[str, Dict[str, Any]] = util.SimpleFrozenDict(),
+) -> Language:
+    """Load a spaCy model from an installed package or a local path.
+
+    name (str): Package name or model path.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
+    """
+    return util.load_model(name, disable=disable, component_cfg=component_cfg)


-def load(name, **overrides):
-    return util.load_model(name, **overrides)
-
-
-def blank(name, **kwargs):
+def blank(name: str, **overrides) -> Language:
+    """Create a blank nlp object for a given language code.
+
+    name (str): The language code, e.g. "en".
+    **overrides: Keyword arguments passed to language subclass on init.
+    RETURNS (Language): The nlp object.
+    """
     LangClass = util.get_lang_class(name)
-    return LangClass(**kwargs)
+    return LangClass(**overrides)
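A quick sketch of the resulting top-level API; the model package name below is only an example, not part of this commit:

import spacy

# load() now takes explicit keyword arguments instead of **overrides
nlp = spacy.load("en_core_web_sm", disable=["parser"])  # example package name

# blank() forwards keyword overrides to the language subclass
nlp_blank = spacy.blank("en")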
@@ -41,7 +41,6 @@ def init_model_cli(
     truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
-    omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
     base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
     # fmt: on
 ):
@@ -60,7 +59,6 @@ def init_model_cli(
         truncate_vectors=truncate_vectors,
         vectors_name=vectors_name,
         model_name=model_name,
-        omit_extra_lookups=omit_extra_lookups,
         base_model=base_model,
         silent=False,
     )
@@ -77,7 +75,6 @@ def init_model(
     truncate_vectors: int = 0,
     vectors_name: Optional[str] = None,
     model_name: Optional[str] = None,
-    omit_extra_lookups: bool = False,
     base_model: Optional[str] = None,
     silent: bool = True,
 ) -> Language:
@@ -109,14 +106,6 @@ def init_model(
     with msg.loading("Creating model..."):
         nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)

-    # Create empty extra lexeme tables so the data from spacy-lookups-data
-    # isn't loaded if these features are accessed
-    if omit_extra_lookups:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
-
     msg.good("Successfully created model")
     if vectors_loc is not None:
         add_vectors(
@@ -120,14 +120,6 @@ def train(
         # Load morph rules
         nlp.vocab.morphology.load_morph_exceptions(morph_rules)

-    # Create empty extra lexeme tables so the data from spacy-lookups-data
-    # isn't loaded if these features are accessed
-    if config["training"]["omit_extra_lookups"]:
-        nlp.vocab.lookups_extra = Lookups()
-        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
-        nlp.vocab.lookups_extra.add_table("lexeme_prob")
-        nlp.vocab.lookups_extra.add_table("lexeme_settings")
-
     # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
     if weights_data is not None:
         tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)
@@ -1,8 +1,7 @@
 [nlp]
 lang = null
-stop_words = []
-lex_attr_getters = {}
 pipeline = []
+load_vocab_data = true

 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
@@ -10,11 +9,6 @@ pipeline = []
 [nlp.lemmatizer]
 @lemmatizers = "spacy.Lemmatizer.v1"

-[nlp.writing_system]
-direction = "ltr"
-has_case = true
-has_letters = true
-
 [components]

 # Training hyper-parameters and additional features.
@@ -45,7 +39,6 @@ score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
 discard_oversize = false
-omit_extra_lookups = false
 batch_by = "sequences"
 raw_text = null
 tag_map = null
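The settings removed from the default config here (stop words, lexical attribute getters, writing system) now live on each language's Defaults class, as the per-language diffs further down show. A rough sketch of how that data is reached at runtime, using the Arabic defaults from this commit:

from spacy.lang.ar import Arabic

nlp = Arabic()
print(nlp.Defaults.writing_system)       # {"direction": "rtl", "has_case": False, "has_letters": True}
print(len(nlp.Defaults.stop_words) > 0)  # stop words come from code, not from the [nlp] config block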
@@ -83,7 +83,7 @@ class Warnings:
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed. The languages with lexeme normalization tables "
-            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
+            "are currently: {langs}")

     # TODO: fix numbering after merging develop into master
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
@@ -434,9 +434,6 @@ class Errors:
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
     E171 = ("Matcher.add received invalid on_match callback argument: expected "
             "callable or None, but got: {arg_type}")
-    E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
-            "Lemmatizer, initialize the class directly. See the docs for "
-            "details: https://spacy.io/api/lemmatizer")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
@@ -486,6 +483,7 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -601,7 +599,7 @@ class Errors:
             "the same `Vocab`.")
     E1000 = ("No pkuseg model available. Provide a pkuseg model when "
              "initializing the pipeline:\n"
-             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m'
+             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
             'nlp = Chinese(config=cfg)')
@@ -25,8 +25,9 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
         lower = True
         if raw is not None:
             raw = raw.lower()
-    ndsv = nlp.Defaults.single_orth_variants
-    ndpv = nlp.Defaults.paired_orth_variants
+    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
+    ndsv = orth_variants.get("single", [])
+    ndpv = orth_variants.get("paired", [])
     words = token_dict.get("words", [])
     tags = token_dict.get("tags", [])
     # keep unmodified if words or tags are not defined
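For context, orth variants are now expected in a vocab lookups table rather than on Language.Defaults. A hypothetical sketch, with placeholder table contents in the same {"tags", "variants"} shape the German defaults used to define:

from spacy.lang.en import English

nlp = English()
nlp.vocab.lookups.add_table(
    "orth_variants",
    {"single": [{"tags": ["$("], "variants": ["…", "..."]}], "paired": []},
)
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
ndsv = orth_variants.get("single", [])  # mirrors the new code in make_orth_variants
ndpv = orth_variants.get("paired", [])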
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "af"
-stop_words = {"@language_data": "spacy.af.stop_words"}
-"""
-
-
-@registry.language_data("spacy.af.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class AfrikaansDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Afrikaans(Language):
     lang = "af"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = AfrikaansDefaults


 __all__ = ["Afrikaans"]
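The same pattern repeats for the other languages below. As a minimal sketch of the new convention, a hypothetical language module would now look roughly like this; the class names, language code and stop-word set are placeholders:

from spacy.language import Language

STOP_WORDS = {"a", "the"}  # placeholder data


class DemoDefaults(Language.Defaults):
    # per-language data is declared as class attributes instead of a
    # registered DEFAULT_CONFIG string
    stop_words = STOP_WORDS


class Demo(Language):
    lang = "xx"  # placeholder language code
    Defaults = DemoDefaults


__all__ = ["Demo"]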
@@ -1,48 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ar"
-stop_words = {"@language_data": "spacy.ar.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-"""
-
-
-@registry.language_data("spacy.ar.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ar.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class ArabicDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


 class Arabic(Language):
-    lang = "ar"
     Defaults = ArabicDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    lang = "ar"


 __all__ = ["Arabic"]
@@ -1,4 +1,6 @@
-from ...symbols import ORTH, LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc


 _exc = {}
@@ -6,41 +8,41 @@ _exc = {}

 # Time
 for exc_data in [
-    {LEMMA: "قبل الميلاد", ORTH: "ق.م"},
-    {LEMMA: "بعد الميلاد", ORTH: "ب. م"},
-    {LEMMA: "ميلادي", ORTH: ".م"},
-    {LEMMA: "هجري", ORTH: ".هـ"},
-    {LEMMA: "توفي", ORTH: ".ت"},
+    {NORM: "قبل الميلاد", ORTH: "ق.م"},
+    {NORM: "بعد الميلاد", ORTH: "ب. م"},
+    {NORM: "ميلادي", ORTH: ".م"},
+    {NORM: "هجري", ORTH: ".هـ"},
+    {NORM: "توفي", ORTH: ".ت"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

 # Scientific abv.
 for exc_data in [
-    {LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"},
-    {LEMMA: "الشارح", ORTH: "الشـ"},
-    {LEMMA: "الظاهر", ORTH: "الظـ"},
-    {LEMMA: "أيضًا", ORTH: "أيضـ"},
-    {LEMMA: "إلى آخره", ORTH: "إلخ"},
-    {LEMMA: "انتهى", ORTH: "اهـ"},
-    {LEMMA: "حدّثنا", ORTH: "ثنا"},
-    {LEMMA: "حدثني", ORTH: "ثنى"},
-    {LEMMA: "أنبأنا", ORTH: "أنا"},
-    {LEMMA: "أخبرنا", ORTH: "نا"},
-    {LEMMA: "مصدر سابق", ORTH: "م. س"},
-    {LEMMA: "مصدر نفسه", ORTH: "م. ن"},
+    {NORM: "صلى الله عليه وسلم", ORTH: "صلعم"},
+    {NORM: "الشارح", ORTH: "الشـ"},
+    {NORM: "الظاهر", ORTH: "الظـ"},
+    {NORM: "أيضًا", ORTH: "أيضـ"},
+    {NORM: "إلى آخره", ORTH: "إلخ"},
+    {NORM: "انتهى", ORTH: "اهـ"},
+    {NORM: "حدّثنا", ORTH: "ثنا"},
+    {NORM: "حدثني", ORTH: "ثنى"},
+    {NORM: "أنبأنا", ORTH: "أنا"},
+    {NORM: "أخبرنا", ORTH: "نا"},
+    {NORM: "مصدر سابق", ORTH: "م. س"},
+    {NORM: "مصدر نفسه", ORTH: "م. ن"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

 # Other abv.
 for exc_data in [
-    {LEMMA: "دكتور", ORTH: "د."},
-    {LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
-    {LEMMA: "أستاذ", ORTH: "أ."},
-    {LEMMA: "بروفيسور", ORTH: "ب."},
+    {NORM: "دكتور", ORTH: "د."},
+    {NORM: "أستاذ دكتور", ORTH: "أ.د"},
+    {NORM: "أستاذ", ORTH: "أ."},
+    {NORM: "بروفيسور", ORTH: "ب."},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

-for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
+for exc_data in [{NORM: "تلفون", ORTH: "ت."}, {NORM: "صندوق بريد", ORTH: "ص.ب"}]:
     _exc[exc_data[ORTH]] = [exc_data]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
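The same mechanical change appears in the other tokenizer-exception modules below: LEMMA becomes NORM on special-case attributes, and the base exceptions are merged in via update_exc at the end of the module instead of in the language's Defaults. A minimal sketch of the pattern for a hypothetical module, with a placeholder entry:

from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
from spacy.symbols import ORTH, NORM
from spacy.util import update_exc

_exc = {}
for exc_data in [{ORTH: "Dr.", NORM: "doctor"}]:  # placeholder entry
    _exc[exc_data[ORTH]] = [exc_data]

# merge base exceptions directly in the module
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)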
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "bg"
-stop_words = {"@language_data": "spacy.bg.stop_words"}
-"""
-
-
-@registry.language_data("spacy.bg.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class BulgarianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Bulgarian(Language):
     lang = "bg"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = BulgarianDefaults


 __all__ = ["Bulgarian"]
@@ -1,44 +1,20 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "bn"
-stop_words = {"@language_data": "spacy.bn.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.bn.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class BengaliDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS


 class Bengali(Language):
     lang = "bn"
     Defaults = BengaliDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Bengali"]
@@ -1,24 +1,26 @@
-from ...symbols import ORTH, LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc


 _exc = {}


 for exc_data in [
-    {ORTH: "ডঃ", LEMMA: "ডক্টর"},
-    {ORTH: "ডাঃ", LEMMA: "ডাক্তার"},
-    {ORTH: "ড.", LEMMA: "ডক্টর"},
-    {ORTH: "ডা.", LEMMA: "ডাক্তার"},
-    {ORTH: "মোঃ", LEMMA: "মোহাম্মদ"},
-    {ORTH: "মো.", LEMMA: "মোহাম্মদ"},
-    {ORTH: "সে.", LEMMA: "সেলসিয়াস"},
-    {ORTH: "কি.মি.", LEMMA: "কিলোমিটার"},
-    {ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
-    {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
-    {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
-    {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"},
+    {ORTH: "ডঃ", NORM: "ডক্টর"},
+    {ORTH: "ডাঃ", NORM: "ডাক্তার"},
+    {ORTH: "ড.", NORM: "ডক্টর"},
+    {ORTH: "ডা.", NORM: "ডাক্তার"},
+    {ORTH: "মোঃ", NORM: "মোহাম্মদ"},
+    {ORTH: "মো.", NORM: "মোহাম্মদ"},
+    {ORTH: "সে.", NORM: "সেলসিয়াস"},
+    {ORTH: "কি.মি.", NORM: "কিলোমিটার"},
+    {ORTH: "কি.মি", NORM: "কিলোমিটার"},
+    {ORTH: "সে.মি.", NORM: "সেন্টিমিটার"},
+    {ORTH: "সে.মি", NORM: "সেন্টিমিটার"},
+    {ORTH: "মি.লি.", NORM: "মিলিলিটার"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -1,49 +1,20 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-from .punctuation import TOKENIZER_INFIXES
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ca"
-stop_words = {"@language_data": "spacy.ca.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.ca.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ca.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class CatalanDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS


 class Catalan(Language):
     lang = "ca"
     Defaults = CatalanDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Catalan"]
@@ -1,38 +1,40 @@
-from ...symbols import ORTH, LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc


 _exc = {}

 for exc_data in [
-    {ORTH: "aprox.", LEMMA: "aproximadament"},
-    {ORTH: "pàg.", LEMMA: "pàgina"},
-    {ORTH: "p.ex.", LEMMA: "per exemple"},
-    {ORTH: "gen.", LEMMA: "gener"},
-    {ORTH: "feb.", LEMMA: "febrer"},
-    {ORTH: "abr.", LEMMA: "abril"},
-    {ORTH: "jul.", LEMMA: "juliol"},
-    {ORTH: "set.", LEMMA: "setembre"},
-    {ORTH: "oct.", LEMMA: "octubre"},
-    {ORTH: "nov.", LEMMA: "novembre"},
-    {ORTH: "dec.", LEMMA: "desembre"},
-    {ORTH: "Dr.", LEMMA: "doctor"},
-    {ORTH: "Sr.", LEMMA: "senyor"},
-    {ORTH: "Sra.", LEMMA: "senyora"},
-    {ORTH: "Srta.", LEMMA: "senyoreta"},
-    {ORTH: "núm", LEMMA: "número"},
-    {ORTH: "St.", LEMMA: "sant"},
-    {ORTH: "Sta.", LEMMA: "santa"},
+    {ORTH: "aprox.", NORM: "aproximadament"},
+    {ORTH: "pàg.", NORM: "pàgina"},
+    {ORTH: "p.ex.", NORM: "per exemple"},
+    {ORTH: "gen.", NORM: "gener"},
+    {ORTH: "feb.", NORM: "febrer"},
+    {ORTH: "abr.", NORM: "abril"},
+    {ORTH: "jul.", NORM: "juliol"},
+    {ORTH: "set.", NORM: "setembre"},
+    {ORTH: "oct.", NORM: "octubre"},
+    {ORTH: "nov.", NORM: "novembre"},
+    {ORTH: "dec.", NORM: "desembre"},
+    {ORTH: "Dr.", NORM: "doctor"},
+    {ORTH: "Sr.", NORM: "senyor"},
+    {ORTH: "Sra.", NORM: "senyora"},
+    {ORTH: "Srta.", NORM: "senyoreta"},
+    {ORTH: "núm", NORM: "número"},
+    {ORTH: "St.", NORM: "sant"},
+    {ORTH: "Sta.", NORM: "santa"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

 # Times
-_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
+_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]

 for h in range(1, 12 + 1):
     for period in ["a.m.", "am"]:
-        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "a.m."}]
     for period in ["p.m.", "pm"]:
-        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "p.m."}]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "cs"
-stop_words = {"@language_data": "spacy.cs.stop_words"}
-"""
-
-
-@registry.language_data("spacy.cs.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class CzechDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Czech(Language):
     lang = "cs"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = CzechDefaults


 __all__ = ["Czech"]
@@ -1,50 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "da"
-stop_words = {"@language_data": "spacy.da.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.da.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.da.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class DanishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Danish(Language):
     lang = "da"
     Defaults = DanishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Danish"]
@@ -2,7 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-from ...symbols import ORTH, LEMMA, NORM
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc


 _exc = {}
@@ -11,44 +13,44 @@ _exc = {}
 # (for "torsdag") are left out because they are ambiguous. The same is the case
 # for abbreviations "jul." and "Jul." ("juli").
 for exc_data in [
-    {ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
-    {ORTH: "jan.", LEMMA: "januar"},
-    {ORTH: "febr.", LEMMA: "februar"},
-    {ORTH: "feb.", LEMMA: "februar"},
-    {ORTH: "mar.", LEMMA: "marts"},
-    {ORTH: "apr.", LEMMA: "april"},
-    {ORTH: "jun.", LEMMA: "juni"},
-    {ORTH: "aug.", LEMMA: "august"},
-    {ORTH: "sept.", LEMMA: "september"},
-    {ORTH: "sep.", LEMMA: "september"},
-    {ORTH: "okt.", LEMMA: "oktober"},
-    {ORTH: "nov.", LEMMA: "november"},
-    {ORTH: "dec.", LEMMA: "december"},
-    {ORTH: "man.", LEMMA: "mandag"},
-    {ORTH: "tirs.", LEMMA: "tirsdag"},
-    {ORTH: "ons.", LEMMA: "onsdag"},
-    {ORTH: "tor.", LEMMA: "torsdag"},
-    {ORTH: "tors.", LEMMA: "torsdag"},
-    {ORTH: "fre.", LEMMA: "fredag"},
-    {ORTH: "lør.", LEMMA: "lørdag"},
-    {ORTH: "Jan.", LEMMA: "januar"},
-    {ORTH: "Febr.", LEMMA: "februar"},
-    {ORTH: "Feb.", LEMMA: "februar"},
-    {ORTH: "Mar.", LEMMA: "marts"},
-    {ORTH: "Apr.", LEMMA: "april"},
-    {ORTH: "Jun.", LEMMA: "juni"},
-    {ORTH: "Aug.", LEMMA: "august"},
-    {ORTH: "Sept.", LEMMA: "september"},
-    {ORTH: "Sep.", LEMMA: "september"},
-    {ORTH: "Okt.", LEMMA: "oktober"},
-    {ORTH: "Nov.", LEMMA: "november"},
-    {ORTH: "Dec.", LEMMA: "december"},
-    {ORTH: "Man.", LEMMA: "mandag"},
-    {ORTH: "Tirs.", LEMMA: "tirsdag"},
-    {ORTH: "Ons.", LEMMA: "onsdag"},
-    {ORTH: "Fre.", LEMMA: "fredag"},
-    {ORTH: "Lør.", LEMMA: "lørdag"},
-    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"},
+    {ORTH: "Kbh.", NORM: "København"},
+    {ORTH: "jan.", NORM: "januar"},
+    {ORTH: "febr.", NORM: "februar"},
+    {ORTH: "feb.", NORM: "februar"},
+    {ORTH: "mar.", NORM: "marts"},
+    {ORTH: "apr.", NORM: "april"},
+    {ORTH: "jun.", NORM: "juni"},
+    {ORTH: "aug.", NORM: "august"},
+    {ORTH: "sept.", NORM: "september"},
+    {ORTH: "sep.", NORM: "september"},
+    {ORTH: "okt.", NORM: "oktober"},
+    {ORTH: "nov.", NORM: "november"},
+    {ORTH: "dec.", NORM: "december"},
+    {ORTH: "man.", NORM: "mandag"},
+    {ORTH: "tirs.", NORM: "tirsdag"},
+    {ORTH: "ons.", NORM: "onsdag"},
+    {ORTH: "tor.", NORM: "torsdag"},
+    {ORTH: "tors.", NORM: "torsdag"},
+    {ORTH: "fre.", NORM: "fredag"},
+    {ORTH: "lør.", NORM: "lørdag"},
+    {ORTH: "Jan.", NORM: "januar"},
+    {ORTH: "Febr.", NORM: "februar"},
+    {ORTH: "Feb.", NORM: "februar"},
+    {ORTH: "Mar.", NORM: "marts"},
+    {ORTH: "Apr.", NORM: "april"},
+    {ORTH: "Jun.", NORM: "juni"},
+    {ORTH: "Aug.", NORM: "august"},
+    {ORTH: "Sept.", NORM: "september"},
+    {ORTH: "Sep.", NORM: "september"},
+    {ORTH: "Okt.", NORM: "oktober"},
+    {ORTH: "Nov.", NORM: "november"},
+    {ORTH: "Dec.", NORM: "december"},
+    {ORTH: "Man.", NORM: "mandag"},
+    {ORTH: "Tirs.", NORM: "tirsdag"},
+    {ORTH: "Ons.", NORM: "onsdag"},
+    {ORTH: "Fre.", NORM: "fredag"},
+    {ORTH: "Lør.", NORM: "lørdag"},
+    {ORTH: "og/eller", NORM: "og/eller"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -548,22 +550,22 @@ for orth in [
     _exc[capitalized] = [{ORTH: capitalized}]

 for exc_data in [
-    {ORTH: "s'gu", LEMMA: "s'gu", NORM: "s'gu"},
-    {ORTH: "S'gu", LEMMA: "s'gu", NORM: "s'gu"},
-    {ORTH: "sgu'", LEMMA: "s'gu", NORM: "s'gu"},
-    {ORTH: "Sgu'", LEMMA: "s'gu", NORM: "s'gu"},
-    {ORTH: "sku'", LEMMA: "skal", NORM: "skulle"},
-    {ORTH: "ku'", LEMMA: "kan", NORM: "kunne"},
-    {ORTH: "Ku'", LEMMA: "kan", NORM: "kunne"},
-    {ORTH: "ka'", LEMMA: "kan", NORM: "kan"},
-    {ORTH: "Ka'", LEMMA: "kan", NORM: "kan"},
-    {ORTH: "gi'", LEMMA: "give", NORM: "giv"},
-    {ORTH: "Gi'", LEMMA: "give", NORM: "giv"},
-    {ORTH: "li'", LEMMA: "lide", NORM: "lide"},
-    {ORTH: "ha'", LEMMA: "have", NORM: "have"},
-    {ORTH: "Ha'", LEMMA: "have", NORM: "have"},
-    {ORTH: "ik'", LEMMA: "ikke", NORM: "ikke"},
-    {ORTH: "Ik'", LEMMA: "ikke", NORM: "ikke"},
+    {ORTH: "s'gu", NORM: "s'gu"},
+    {ORTH: "S'gu", NORM: "s'gu"},
+    {ORTH: "sgu'", NORM: "s'gu"},
+    {ORTH: "Sgu'", NORM: "s'gu"},
+    {ORTH: "sku'", NORM: "skulle"},
+    {ORTH: "ku'", NORM: "kunne"},
+    {ORTH: "Ku'", NORM: "kunne"},
+    {ORTH: "ka'", NORM: "kan"},
+    {ORTH: "Ka'", NORM: "kan"},
+    {ORTH: "gi'", NORM: "giv"},
+    {ORTH: "Gi'", NORM: "giv"},
+    {ORTH: "li'", NORM: "lide"},
+    {ORTH: "ha'", NORM: "have"},
+    {ORTH: "Ha'", NORM: "have"},
+    {ORTH: "ik'", NORM: "ikke"},
+    {ORTH: "Ik'", NORM: "ikke"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -573,7 +575,7 @@ for h in range(1, 31 + 1):
     for period in ["."]:
         _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]

-_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]}
+_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
 _exc.update(_custom_base_exc)

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
| 
						 | 
					@ -1,61 +1,22 @@
 | 
				
			||||||
from typing import Set
 | 
					 | 
				
			||||||
from thinc.api import Config
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
					from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
				
			||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 | 
					from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 | 
				
			||||||
from .punctuation import TOKENIZER_INFIXES
 | 
					 | 
				
			||||||
from .stop_words import STOP_WORDS
 | 
					from .stop_words import STOP_WORDS
 | 
				
			||||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
					from .syntax_iterators import SYNTAX_ITERATORS
 | 
				
			||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
					 | 
				
			||||||
from ...language import Language
 | 
					from ...language import Language
 | 
				
			||||||
from ...util import update_exc, registry
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
DEFAULT_CONFIG = """
 | 
					 | 
				
			||||||
[nlp]
 | 
					 | 
				
			||||||
lang = "de"
 | 
					 | 
				
			||||||
stop_words = {"@language_data": "spacy.de.stop_words"}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
[nlp.lemmatizer]
 | 
					 | 
				
			||||||
@lemmatizers = "spacy.Lemmatizer.v1"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
[nlp.lemmatizer.data_paths]
 | 
					 | 
				
			||||||
@language_data = "spacy-lookups-data"
 | 
					 | 
				
			||||||
lang = ${nlp:lang}
 | 
					 | 
				
			||||||
"""
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@registry.language_data("spacy.de.stop_words")
 | 
					 | 
				
			||||||
def stop_words() -> Set[str]:
 | 
					 | 
				
			||||||
    return STOP_WORDS
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class GermanDefaults(Language.Defaults):
 | 
					class GermanDefaults(Language.Defaults):
 | 
				
			||||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
					    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 | 
				
			||||||
    prefixes = TOKENIZER_PREFIXES
 | 
					    prefixes = TOKENIZER_PREFIXES
 | 
				
			||||||
    suffixes = TOKENIZER_SUFFIXES
 | 
					    suffixes = TOKENIZER_SUFFIXES
 | 
				
			||||||
    infixes = TOKENIZER_INFIXES
 | 
					    infixes = TOKENIZER_INFIXES
 | 
				
			||||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
					    syntax_iterators = SYNTAX_ITERATORS
 | 
				
			||||||
    single_orth_variants = [
 | 
					    stop_words = STOP_WORDS
 | 
				
			||||||
        {"tags": ["$("], "variants": ["…", "..."]},
 | 
					 | 
				
			||||||
        {"tags": ["$("], "variants": ["-", "—", "–", "--", "---", "——"]},
 | 
					 | 
				
			||||||
    ]
 | 
					 | 
				
			||||||
    paired_orth_variants = [
 | 
					 | 
				
			||||||
        {
 | 
					 | 
				
			||||||
            "tags": ["$("],
 | 
					 | 
				
			||||||
            "variants": [("'", "'"), (",", "'"), ("‚", "‘"), ("›", "‹"), ("‹", "›")],
 | 
					 | 
				
			||||||
        },
 | 
					 | 
				
			||||||
        {
 | 
					 | 
				
			||||||
            "tags": ["$("],
 | 
					 | 
				
			||||||
            "variants": [("``", "''"), ('"', '"'), ("„", "“"), ("»", "«"), ("«", "»")],
 | 
					 | 
				
			||||||
        },
 | 
					 | 
				
			||||||
    ]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class German(Language):
 | 
					class German(Language):
 | 
				
			||||||
    lang = "de"
 | 
					    lang = "de"
 | 
				
			||||||
    Defaults = GermanDefaults
 | 
					    Defaults = GermanDefaults
 | 
				
			||||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = ["German"]
 | 
					__all__ = ["German"]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,39 +1,26 @@
 | 
				
			||||||
 | 
					from typing import Union, Iterator
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ...symbols import NOUN, PROPN, PRON
 | 
					from ...symbols import NOUN, PROPN, PRON
 | 
				
			||||||
from ...errors import Errors
 | 
					from ...errors import Errors
 | 
				
			||||||
 | 
					from ...tokens import Doc, Span
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def noun_chunks(doclike):
 | 
					def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
 | 
				
			||||||
    """
 | 
					    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
 | 
				
			||||||
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    # this iterator extracts spans headed by NOUNs starting from the left-most
 | 
					    # this iterator extracts spans headed by NOUNs starting from the left-most
 | 
				
			||||||
    # syntactic dependent until the NOUN itself for close apposition and
 | 
					    # syntactic dependent until the NOUN itself for close apposition and
 | 
				
			||||||
    # measurement construction, the span is sometimes extended to the right of
 | 
					    # measurement construction, the span is sometimes extended to the right of
 | 
				
			||||||
    # the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
 | 
					    # the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
 | 
				
			||||||
    # and not just "eine Tasse", same for "das Thema Familie".
 | 
					    # and not just "eine Tasse", same for "das Thema Familie".
 | 
				
			||||||
    labels = [
 | 
					    # fmt: off
 | 
				
			||||||
        "sb",
 | 
					    labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
 | 
				
			||||||
        "oa",
 | 
					    # fmt: on
 | 
				
			||||||
        "da",
 | 
					 | 
				
			||||||
        "nk",
 | 
					 | 
				
			||||||
        "mo",
 | 
					 | 
				
			||||||
        "ag",
 | 
					 | 
				
			||||||
        "ROOT",
 | 
					 | 
				
			||||||
        "root",
 | 
					 | 
				
			||||||
        "cj",
 | 
					 | 
				
			||||||
        "pd",
 | 
					 | 
				
			||||||
        "og",
 | 
					 | 
				
			||||||
        "app",
 | 
					 | 
				
			||||||
    ]
 | 
					 | 
				
			||||||
    doc = doclike.doc  # Ensure works on both Doc and Span.
 | 
					    doc = doclike.doc  # Ensure works on both Doc and Span.
 | 
				
			||||||
 | 
					 | 
				
			||||||
    if not doc.is_parsed:
 | 
					    if not doc.is_parsed:
 | 
				
			||||||
        raise ValueError(Errors.E029)
 | 
					        raise ValueError(Errors.E029)
 | 
				
			||||||
 | 
					 | 
				
			||||||
    np_label = doc.vocab.strings.add("NP")
 | 
					    np_label = doc.vocab.strings.add("NP")
 | 
				
			||||||
    np_deps = set(doc.vocab.strings.add(label) for label in labels)
 | 
					    np_deps = set(doc.vocab.strings.add(label) for label in labels)
 | 
				
			||||||
    close_app = doc.vocab.strings.add("nk")
 | 
					    close_app = doc.vocab.strings.add("nk")
 | 
				
			||||||
 | 
					 | 
				
			||||||
    rbracket = 0
 | 
					    rbracket = 0
 | 
				
			||||||
    for i, word in enumerate(doclike):
 | 
					    for i, word in enumerate(doclike):
 | 
				
			||||||
        if i < rbracket:
 | 
					        if i < rbracket:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,157 +1,135 @@
 | 
				
			||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
 | 
					from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
				
			||||||
 | 
					from ...symbols import ORTH, NORM
 | 
				
			||||||
 | 
					from ...util import update_exc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_exc = {
 | 
					_exc = {
 | 
				
			||||||
    "auf'm": [{ORTH: "auf", LEMMA: "auf"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
 | 
					    "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
 | 
				
			||||||
    "du's": [
 | 
					    "du's": [{ORTH: "du"}, {ORTH: "'s", NORM: "es"}],
 | 
				
			||||||
        {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
 | 
					    "er's": [{ORTH: "er"}, {ORTH: "'s", NORM: "es"}],
 | 
				
			||||||
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
 | 
					    "hinter'm": [{ORTH: "hinter"}, {ORTH: "'m", NORM: "dem"}],
 | 
				
			||||||
    ],
 | 
					    "ich's": [{ORTH: "ich"}, {ORTH: "'s", NORM: "es"}],
 | 
				
			||||||
    "er's": [
 | 
					    "ihr's": [{ORTH: "ihr"}, {ORTH: "'s", NORM: "es"}],
 | 
				
			||||||
        {ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"},
 | 
					    "sie's": [{ORTH: "sie"}, {ORTH: "'s", NORM: "es"}],
 | 
				
			||||||
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
 | 
					    "unter'm": [{ORTH: "unter"}, {ORTH: "'m", NORM: "dem"}],
 | 
				
			||||||
    ],
 | 
					    "vor'm": [{ORTH: "vor"}, {ORTH: "'m", NORM: "dem"}],
 | 
				
			||||||
    "hinter'm": [
 | 
					    "wir's": [{ORTH: "wir"}, {ORTH: "'s", NORM: "es"}],
 | 
				
			||||||
        {ORTH: "hinter", LEMMA: "hinter"},
 | 
					    "über'm": [{ORTH: "über"}, {ORTH: "'m", NORM: "dem"}],
 | 
				
			||||||
        {ORTH: "'m", LEMMA: "der", NORM: "dem"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "ich's": [
 | 
					 | 
				
			||||||
        {ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"},
 | 
					 | 
				
			||||||
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "ihr's": [
 | 
					 | 
				
			||||||
        {ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"},
 | 
					 | 
				
			||||||
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "sie's": [
 | 
					 | 
				
			||||||
        {ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"},
 | 
					 | 
				
			||||||
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "unter'm": [
 | 
					 | 
				
			||||||
        {ORTH: "unter", LEMMA: "unter"},
 | 
					 | 
				
			||||||
        {ORTH: "'m", LEMMA: "der", NORM: "dem"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "vor'm": [{ORTH: "vor", LEMMA: "vor"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
 | 
					 | 
				
			||||||
    "wir's": [
 | 
					 | 
				
			||||||
        {ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"},
 | 
					 | 
				
			||||||
        {ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "über'm": [{ORTH: "über", LEMMA: "über"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for exc_data in [
 | 
					for exc_data in [
 | 
				
			||||||
    {ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
 | 
					    {ORTH: "'S", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
 | 
					    {ORTH: "'s", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
 | 
					    {ORTH: "S'", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
 | 
					    {ORTH: "s'", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "'n", LEMMA: "ein", NORM: "ein"},
 | 
					    {ORTH: "'n", NORM: "ein"},
 | 
				
			||||||
    {ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
 | 
					    {ORTH: "'ne", NORM: "eine"},
 | 
				
			||||||
    {ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
 | 
					    {ORTH: "'nen", NORM: "einen"},
 | 
				
			||||||
    {ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
 | 
					    {ORTH: "'nem", NORM: "einem"},
 | 
				
			||||||
    {ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
 | 
					    {ORTH: "Abb.", NORM: "Abbildung"},
 | 
				
			||||||
    {ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
 | 
					    {ORTH: "Abk.", NORM: "Abkürzung"},
 | 
				
			||||||
    {ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
 | 
					    {ORTH: "Abt.", NORM: "Abteilung"},
 | 
				
			||||||
    {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
 | 
					    {ORTH: "Apr.", NORM: "April"},
 | 
				
			||||||
    {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
 | 
					    {ORTH: "Aug.", NORM: "August"},
 | 
				
			||||||
    {ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
 | 
					    {ORTH: "Bd.", NORM: "Band"},
 | 
				
			||||||
    {ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
 | 
					    {ORTH: "Betr.", NORM: "Betreff"},
 | 
				
			||||||
    {ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
 | 
					    {ORTH: "Bf.", NORM: "Bahnhof"},
 | 
				
			||||||
    {ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
 | 
					    {ORTH: "Bhf.", NORM: "Bahnhof"},
 | 
				
			||||||
    {ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
 | 
					    {ORTH: "Bsp.", NORM: "Beispiel"},
 | 
				
			||||||
    {ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
 | 
					    {ORTH: "Dez.", NORM: "Dezember"},
 | 
				
			||||||
    {ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
 | 
					    {ORTH: "Di.", NORM: "Dienstag"},
 | 
				
			||||||
    {ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
 | 
					    {ORTH: "Do.", NORM: "Donnerstag"},
 | 
				
			||||||
    {ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
 | 
					    {ORTH: "Fa.", NORM: "Firma"},
 | 
				
			||||||
    {ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
 | 
					    {ORTH: "Fam.", NORM: "Familie"},
 | 
				
			||||||
    {ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
 | 
					    {ORTH: "Feb.", NORM: "Februar"},
 | 
				
			||||||
    {ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
 | 
					    {ORTH: "Fr.", NORM: "Frau"},
 | 
				
			||||||
    {ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
 | 
					    {ORTH: "Frl.", NORM: "Fräulein"},
 | 
				
			||||||
    {ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
 | 
					    {ORTH: "Hbf.", NORM: "Hauptbahnhof"},
 | 
				
			||||||
    {ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
 | 
					    {ORTH: "Hr.", NORM: "Herr"},
 | 
				
			||||||
    {ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
 | 
					    {ORTH: "Hrn.", NORM: "Herrn"},
 | 
				
			||||||
    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
 | 
					    {ORTH: "Jan.", NORM: "Januar"},
 | 
				
			||||||
    {ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
 | 
					    {ORTH: "Jh.", NORM: "Jahrhundert"},
 | 
				
			||||||
    {ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
 | 
					    {ORTH: "Jhd.", NORM: "Jahrhundert"},
 | 
				
			||||||
    {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
 | 
					    {ORTH: "Jul.", NORM: "Juli"},
 | 
				
			||||||
    {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
 | 
					    {ORTH: "Jun.", NORM: "Juni"},
 | 
				
			||||||
    {ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
 | 
					    {ORTH: "Mi.", NORM: "Mittwoch"},
 | 
				
			||||||
    {ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
 | 
					    {ORTH: "Mio.", NORM: "Million"},
 | 
				
			||||||
    {ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
 | 
					    {ORTH: "Mo.", NORM: "Montag"},
 | 
				
			||||||
    {ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
 | 
					    {ORTH: "Mrd.", NORM: "Milliarde"},
 | 
				
			||||||
    {ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
 | 
					    {ORTH: "Mrz.", NORM: "März"},
 | 
				
			||||||
    {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
 | 
					    {ORTH: "MwSt.", NORM: "Mehrwertsteuer"},
 | 
				
			||||||
    {ORTH: "Mär.", LEMMA: "März", NORM: "März"},
 | 
					    {ORTH: "Mär.", NORM: "März"},
 | 
				
			||||||
    {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
 | 
					    {ORTH: "Nov.", NORM: "November"},
 | 
				
			||||||
    {ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
 | 
					    {ORTH: "Nr.", NORM: "Nummer"},
 | 
				
			||||||
    {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
 | 
					    {ORTH: "Okt.", NORM: "Oktober"},
 | 
				
			||||||
    {ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
 | 
					    {ORTH: "Orig.", NORM: "Original"},
 | 
				
			||||||
    {ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
 | 
					    {ORTH: "Pkt.", NORM: "Punkt"},
 | 
				
			||||||
    {ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
 | 
					    {ORTH: "Prof.", NORM: "Professor"},
 | 
				
			||||||
    {ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
 | 
					    {ORTH: "Red.", NORM: "Redaktion"},
 | 
				
			||||||
    {ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
 | 
					    {ORTH: "Sa.", NORM: "Samstag"},
 | 
				
			||||||
    {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
 | 
					    {ORTH: "Sep.", NORM: "September"},
 | 
				
			||||||
    {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
 | 
					    {ORTH: "Sept.", NORM: "September"},
 | 
				
			||||||
    {ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
 | 
					    {ORTH: "So.", NORM: "Sonntag"},
 | 
				
			||||||
    {ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
 | 
					    {ORTH: "Std.", NORM: "Stunde"},
 | 
				
			||||||
    {ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
 | 
					    {ORTH: "Str.", NORM: "Straße"},
 | 
				
			||||||
    {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
 | 
					    {ORTH: "Tel.", NORM: "Telefon"},
 | 
				
			||||||
    {ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
 | 
					    {ORTH: "Tsd.", NORM: "Tausend"},
 | 
				
			||||||
    {ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
 | 
					    {ORTH: "Univ.", NORM: "Universität"},
 | 
				
			||||||
    {ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
 | 
					    {ORTH: "abzgl.", NORM: "abzüglich"},
 | 
				
			||||||
    {ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
 | 
					    {ORTH: "allg.", NORM: "allgemein"},
 | 
				
			||||||
    {ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
 | 
					    {ORTH: "bspw.", NORM: "beispielsweise"},
 | 
				
			||||||
    {ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
 | 
					    {ORTH: "bzgl.", NORM: "bezüglich"},
 | 
				
			||||||
    {ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
 | 
					    {ORTH: "bzw.", NORM: "beziehungsweise"},
 | 
				
			||||||
    {ORTH: "d.h.", LEMMA: "das heißt"},
 | 
					    {ORTH: "d.h."},
 | 
				
			||||||
    {ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
 | 
					    {ORTH: "dgl.", NORM: "dergleichen"},
 | 
				
			||||||
    {ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
 | 
					    {ORTH: "ebd.", NORM: "ebenda"},
 | 
				
			||||||
    {ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
 | 
					    {ORTH: "eigtl.", NORM: "eigentlich"},
 | 
				
			||||||
    {ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
 | 
					    {ORTH: "engl.", NORM: "englisch"},
 | 
				
			||||||
    {ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
 | 
					    {ORTH: "evtl.", NORM: "eventuell"},
 | 
				
			||||||
    {ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
 | 
					    {ORTH: "frz.", NORM: "französisch"},
 | 
				
			||||||
    {ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
 | 
					    {ORTH: "gegr.", NORM: "gegründet"},
 | 
				
			||||||
    {ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
 | 
					    {ORTH: "ggf.", NORM: "gegebenenfalls"},
 | 
				
			||||||
    {ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
 | 
					    {ORTH: "ggfs.", NORM: "gegebenenfalls"},
 | 
				
			||||||
    {ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
 | 
					    {ORTH: "ggü.", NORM: "gegenüber"},
 | 
				
			||||||
    {ORTH: "i.O.", LEMMA: "in Ordnung"},
 | 
					    {ORTH: "i.O."},
 | 
				
			||||||
    {ORTH: "i.d.R.", LEMMA: "in der Regel"},
 | 
					    {ORTH: "i.d.R."},
 | 
				
			||||||
    {ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
 | 
					    {ORTH: "incl.", NORM: "inklusive"},
 | 
				
			||||||
    {ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
 | 
					    {ORTH: "inkl.", NORM: "inklusive"},
 | 
				
			||||||
    {ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
 | 
					    {ORTH: "insb.", NORM: "insbesondere"},
 | 
				
			||||||
    {ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
 | 
					    {ORTH: "kath.", NORM: "katholisch"},
 | 
				
			||||||
    {ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
 | 
					    {ORTH: "lt.", NORM: "laut"},
 | 
				
			||||||
    {ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
 | 
					    {ORTH: "max.", NORM: "maximal"},
 | 
				
			||||||
    {ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
 | 
					    {ORTH: "min.", NORM: "minimal"},
 | 
				
			||||||
    {ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
 | 
					    {ORTH: "mind.", NORM: "mindestens"},
 | 
				
			||||||
    {ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
 | 
					    {ORTH: "mtl.", NORM: "monatlich"},
 | 
				
			||||||
    {ORTH: "n.Chr.", LEMMA: "nach Christus"},
 | 
					    {ORTH: "n.Chr."},
 | 
				
			||||||
    {ORTH: "orig.", LEMMA: "original", NORM: "original"},
 | 
					    {ORTH: "orig.", NORM: "original"},
 | 
				
			||||||
    {ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
 | 
					    {ORTH: "röm.", NORM: "römisch"},
 | 
				
			||||||
    {ORTH: "s.o.", LEMMA: "siehe oben"},
 | 
					    {ORTH: "s.o."},
 | 
				
			||||||
    {ORTH: "sog.", LEMMA: "so genannt"},
 | 
					    {ORTH: "sog."},
 | 
				
			||||||
    {ORTH: "stellv.", LEMMA: "stellvertretend"},
 | 
					    {ORTH: "stellv."},
 | 
				
			||||||
    {ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
 | 
					    {ORTH: "tägl.", NORM: "täglich"},
 | 
				
			||||||
    {ORTH: "u.U.", LEMMA: "unter Umständen"},
 | 
					    {ORTH: "u.U."},
 | 
				
			||||||
    {ORTH: "u.s.w.", LEMMA: "und so weiter"},
 | 
					    {ORTH: "u.s.w."},
 | 
				
			||||||
    {ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
 | 
					    {ORTH: "u.v.m."},
 | 
				
			||||||
    {ORTH: "usf.", LEMMA: "und so fort"},
 | 
					    {ORTH: "usf."},
 | 
				
			||||||
    {ORTH: "usw.", LEMMA: "und so weiter"},
 | 
					    {ORTH: "usw."},
 | 
				
			||||||
    {ORTH: "uvm.", LEMMA: "und vieles mehr"},
 | 
					    {ORTH: "uvm."},
 | 
				
			||||||
    {ORTH: "v.Chr.", LEMMA: "vor Christus"},
 | 
					    {ORTH: "v.Chr."},
 | 
				
			||||||
    {ORTH: "v.a.", LEMMA: "vor allem"},
 | 
					    {ORTH: "v.a."},
 | 
				
			||||||
    {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
 | 
					    {ORTH: "v.l.n.r."},
 | 
				
			||||||
    {ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
 | 
					    {ORTH: "vgl.", NORM: "vergleiche"},
 | 
				
			||||||
    {ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
 | 
					    {ORTH: "vllt.", NORM: "vielleicht"},
 | 
				
			||||||
    {ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
 | 
					    {ORTH: "vlt.", NORM: "vielleicht"},
 | 
				
			||||||
    {ORTH: "z.B.", LEMMA: "zum Beispiel"},
 | 
					    {ORTH: "z.B."},
 | 
				
			||||||
    {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
 | 
					    {ORTH: "z.Bsp."},
 | 
				
			||||||
    {ORTH: "z.T.", LEMMA: "zum Teil"},
 | 
					    {ORTH: "z.T."},
 | 
				
			||||||
    {ORTH: "z.Z.", LEMMA: "zur Zeit"},
 | 
					    {ORTH: "z.Z."},
 | 
				
			||||||
    {ORTH: "z.Zt.", LEMMA: "zur Zeit"},
 | 
					    {ORTH: "z.Zt."},
 | 
				
			||||||
    {ORTH: "z.b.", LEMMA: "zum Beispiel"},
 | 
					    {ORTH: "z.b."},
 | 
				
			||||||
    {ORTH: "zzgl.", LEMMA: "zuzüglich"},
 | 
					    {ORTH: "zzgl."},
 | 
				
			||||||
    {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"},
 | 
					    {ORTH: "österr.", NORM: "österreichisch"},
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    _exc[exc_data[ORTH]] = [exc_data]
 | 
					    _exc[exc_data[ORTH]] = [exc_data]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -254,4 +232,4 @@ for orth in [
 | 
				
			||||||
    _exc[orth] = [{ORTH: orth}]
 | 
					    _exc[orth] = [{ORTH: orth}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TOKENIZER_EXCEPTIONS = _exc
 | 
					TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,4 @@
 | 
				
			||||||
from typing import Set, Dict, Callable, Any
 | 
					from typing import Callable
 | 
				
			||||||
from thinc.api import Config
 | 
					from thinc.api import Config
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
					from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
				
			||||||
| 
						 | 
					@ -7,53 +7,44 @@ from .lex_attrs import LEX_ATTRS
 | 
				
			||||||
from .lemmatizer import GreekLemmatizer
 | 
					from .lemmatizer import GreekLemmatizer
 | 
				
			||||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
					from .syntax_iterators import SYNTAX_ITERATORS
 | 
				
			||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 | 
					from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 | 
				
			||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
					from ...lookups import load_lookups
 | 
				
			||||||
from ...language import Language
 | 
					from ...language import Language
 | 
				
			||||||
from ...util import update_exc, registry
 | 
					from ...util import registry
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DEFAULT_CONFIG = """
 | 
					DEFAULT_CONFIG = """
 | 
				
			||||||
[nlp]
 | 
					[nlp]
 | 
				
			||||||
lang = "el"
 | 
					 | 
				
			||||||
stop_words = {"@language_data": "spacy.el.stop_words"}
 | 
					 | 
				
			||||||
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.lemmatizer]
 | 
					[nlp.lemmatizer]
 | 
				
			||||||
@lemmatizers = "spacy.GreekLemmatizer.v1"
 | 
					@lemmatizers = "spacy.el.GreekLemmatizer"
 | 
				
			||||||
 | 
					 | 
				
			||||||
[nlp.lemmatizer.data_paths]
 | 
					 | 
				
			||||||
@language_data = "spacy-lookups-data"
 | 
					 | 
				
			||||||
lang = ${nlp:lang}
 | 
					 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
 | 
					@registry.lemmatizers("spacy.el.GreekLemmatizer")
 | 
				
			||||||
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
 | 
					def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
 | 
				
			||||||
    return GreekLemmatizer(data_paths=data_paths)
 | 
					    tables = ["lemma_index", "lemma_exc", "lemma_rules"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
 | 
				
			||||||
 | 
					        lookups = load_lookups(lang=nlp.lang, tables=tables)
 | 
				
			||||||
 | 
					        return GreekLemmatizer(lookups=lookups)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.language_data("spacy.el.stop_words")
 | 
					    return lemmatizer_factory
 | 
				
			||||||
def stop_words() -> Set[str]:
 | 
					 | 
				
			||||||
    return STOP_WORDS
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@registry.language_data("spacy.el.lex_attr_getters")
 | 
					 | 
				
			||||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
					 | 
				
			||||||
    return LEX_ATTRS
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class GreekDefaults(Language.Defaults):
 | 
					class GreekDefaults(Language.Defaults):
 | 
				
			||||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
					    config = Config().from_str(DEFAULT_CONFIG)
 | 
				
			||||||
 | 
					    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 | 
				
			||||||
    prefixes = TOKENIZER_PREFIXES
 | 
					    prefixes = TOKENIZER_PREFIXES
 | 
				
			||||||
    suffixes = TOKENIZER_SUFFIXES
 | 
					    suffixes = TOKENIZER_SUFFIXES
 | 
				
			||||||
    infixes = TOKENIZER_INFIXES
 | 
					    infixes = TOKENIZER_INFIXES
 | 
				
			||||||
 | 
					    lex_attr_getters = LEX_ATTRS
 | 
				
			||||||
 | 
					    stop_words = STOP_WORDS
 | 
				
			||||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
					    syntax_iterators = SYNTAX_ITERATORS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Greek(Language):
 | 
					class Greek(Language):
 | 
				
			||||||
    lang = "el"
 | 
					    lang = "el"
 | 
				
			||||||
    Defaults = GreekDefaults
 | 
					    Defaults = GreekDefaults
 | 
				
			||||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = ["Greek"]
 | 
					__all__ = ["Greek"]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,21 +1,20 @@
 | 
				
			||||||
 | 
					from typing import Union, Iterator
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ...symbols import NOUN, PROPN, PRON
 | 
					from ...symbols import NOUN, PROPN, PRON
 | 
				
			||||||
from ...errors import Errors
 | 
					from ...errors import Errors
 | 
				
			||||||
 | 
					from ...tokens import Doc, Span
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def noun_chunks(doclike):
 | 
					def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
 | 
				
			||||||
    """
 | 
					    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
 | 
				
			||||||
    Detect base noun phrases. Works on both Doc and Span.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    # It follows the logic of the noun chunks finder of English language,
 | 
					    # It follows the logic of the noun chunks finder of English language,
 | 
				
			||||||
    # adjusted to some Greek language special characteristics.
 | 
					    # adjusted to some Greek language special characteristics.
 | 
				
			||||||
    # obj tag corrects some DEP tagger mistakes.
 | 
					    # obj tag corrects some DEP tagger mistakes.
 | 
				
			||||||
    # Further improvement of the models will eliminate the need for this tag.
 | 
					    # Further improvement of the models will eliminate the need for this tag.
 | 
				
			||||||
    labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
 | 
					    labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
 | 
				
			||||||
    doc = doclike.doc  # Ensure works on both Doc and Span.
 | 
					    doc = doclike.doc  # Ensure works on both Doc and Span.
 | 
				
			||||||
 | 
					 | 
				
			||||||
    if not doc.is_parsed:
 | 
					    if not doc.is_parsed:
 | 
				
			||||||
        raise ValueError(Errors.E029)
 | 
					        raise ValueError(Errors.E029)
 | 
				
			||||||
 | 
					 | 
				
			||||||
    np_deps = [doc.vocab.strings.add(label) for label in labels]
 | 
					    np_deps = [doc.vocab.strings.add(label) for label in labels]
 | 
				
			||||||
    conj = doc.vocab.strings.add("conj")
 | 
					    conj = doc.vocab.strings.add("conj")
 | 
				
			||||||
    nmod = doc.vocab.strings.add("nmod")
 | 
					    nmod = doc.vocab.strings.add("nmod")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| 
						 | 
					@ -1,129 +1,128 @@
 | 
				
			||||||
from ...symbols import ORTH, LEMMA, NORM
 | 
					from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
				
			||||||
 | 
					from ...symbols import ORTH, NORM
 | 
				
			||||||
 | 
					from ...util import update_exc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_exc = {}
 | 
					_exc = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
 | 
					for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "από", NORM: "από"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "από"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Αλλ'", "αλλ'"]:
 | 
					for token in ["Αλλ'", "αλλ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "αλλά", NORM: "αλλά"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "αλλά"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
 | 
					for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "παρά", NORM: "παρά"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "παρά"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["καθ'", "Καθ'"]:
 | 
					for token in ["καθ'", "Καθ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "κάθε", NORM: "κάθε"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "κάθε"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["κατ'", "Κατ'"]:
 | 
					for token in ["κατ'", "Κατ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "κατά", NORM: "κατά"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "κατά"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
 | 
					for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "είμαι", NORM: "είμαι"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "είμαι"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
 | 
					for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "επί", NORM: "επί"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "επί"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Δι'", "δι'"]:
 | 
					for token in ["Δι'", "δι'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "δια", NORM: "δια"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "δια"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
 | 
					for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "έχω", NORM: "έχω"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "έχω"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["υπ'", "Υπ'"]:
 | 
					for token in ["υπ'", "Υπ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "υπό", NORM: "υπό"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "υπό"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
 | 
					for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "μετά", NORM: "μετά"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "μετά"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Μ'", "μ'"]:
 | 
					for token in ["Μ'", "μ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "με", NORM: "με"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "με"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Γι'", "ΓΙ'", "γι'"]:
 | 
					for token in ["Γι'", "ΓΙ'", "γι'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "για", NORM: "για"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "για"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Σ'", "σ'"]:
 | 
					for token in ["Σ'", "σ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "σε", NORM: "σε"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "σε"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Θ'", "θ'"]:
 | 
					for token in ["Θ'", "θ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "θα", NORM: "θα"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "θα"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Ν'", "ν'"]:
 | 
					for token in ["Ν'", "ν'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "να"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Τ'", "τ'"]:
 | 
					for token in ["Τ'", "τ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "να"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["'γω", "'σένα", "'μεις"]:
 | 
					for token in ["'γω", "'σένα", "'μεις"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "εγώ", NORM: "εγώ"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "εγώ"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Τ'", "τ'"]:
 | 
					for token in ["Τ'", "τ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "το", NORM: "το"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "το"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
 | 
					for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "φέρνω", NORM: "φέρνω"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "φέρνω"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
 | 
					for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "έρχομαι", NORM: "έρχομαι"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "έρχομαι"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
 | 
					for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "λέγω", NORM: "λέγω"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "λέγω"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Πάρ'", "πάρ'"]:
 | 
					for token in ["Πάρ'", "πάρ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "παίρνω", NORM: "παίρνω"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "παίρνω"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["μέσ'", "Μέσ'", "μεσ'"]:
 | 
					for token in ["μέσ'", "Μέσ'", "μεσ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "μέσα", NORM: "μέσα"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "μέσα"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
 | 
					for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "δένω", NORM: "δένω"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "δένω"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for token in ["'κανε", "Κάν'"]:
 | 
					for token in ["'κανε", "Κάν'"]:
 | 
				
			||||||
    _exc[token] = [{ORTH: token, LEMMA: "κάνω", NORM: "κάνω"}]
 | 
					    _exc[token] = [{ORTH: token, NORM: "κάνω"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_other_exc = {
 | 
					_other_exc = {
 | 
				
			||||||
    "κι": [{ORTH: "κι", LEMMA: "και", NORM: "και"}],
 | 
					    "κι": [{ORTH: "κι", NORM: "και"}],
 | 
				
			||||||
    "Παίξ'": [{ORTH: "Παίξ'", LEMMA: "παίζω", NORM: "παίζω"}],
 | 
					    "Παίξ'": [{ORTH: "Παίξ'", NORM: "παίζω"}],
 | 
				
			||||||
    "Αντ'": [{ORTH: "Αντ'", LEMMA: "αντί", NORM: "αντί"}],
 | 
					    "Αντ'": [{ORTH: "Αντ'", NORM: "αντί"}],
 | 
				
			||||||
    "ολ'": [{ORTH: "ολ'", LEMMA: "όλος", NORM: "όλος"}],
 | 
					    "ολ'": [{ORTH: "ολ'", NORM: "όλος"}],
 | 
				
			||||||
    "ύστερ'": [{ORTH: "ύστερ'", LEMMA: "ύστερα", NORM: "ύστερα"}],
 | 
					    "ύστερ'": [{ORTH: "ύστερ'", NORM: "ύστερα"}],
 | 
				
			||||||
    "'πρεπε": [{ORTH: "'πρεπε", LEMMA: "πρέπει", NORM: "πρέπει"}],
 | 
					    "'πρεπε": [{ORTH: "'πρεπε", NORM: "πρέπει"}],
 | 
				
			||||||
    "Δύσκολ'": [{ORTH: "Δύσκολ'", LEMMA: "δύσκολος", NORM: "δύσκολος"}],
 | 
					    "Δύσκολ'": [{ORTH: "Δύσκολ'", NORM: "δύσκολος"}],
 | 
				
			||||||
    "'θελα": [{ORTH: "'θελα", LEMMA: "θέλω", NORM: "θέλω"}],
 | 
					    "'θελα": [{ORTH: "'θελα", NORM: "θέλω"}],
 | 
				
			||||||
    "'γραφα": [{ORTH: "'γραφα", LEMMA: "γράφω", NORM: "γράφω"}],
 | 
					    "'γραφα": [{ORTH: "'γραφα", NORM: "γράφω"}],
 | 
				
			||||||
    "'παιρνα": [{ORTH: "'παιρνα", LEMMA: "παίρνω", NORM: "παίρνω"}],
 | 
					    "'παιρνα": [{ORTH: "'παιρνα", NORM: "παίρνω"}],
 | 
				
			||||||
    "'δειξε": [{ORTH: "'δειξε", LEMMA: "δείχνω", NORM: "δείχνω"}],
 | 
					    "'δειξε": [{ORTH: "'δειξε", NORM: "δείχνω"}],
 | 
				
			||||||
    "όμουρφ'": [{ORTH: "όμουρφ'", LEMMA: "όμορφος", NORM: "όμορφος"}],
 | 
					    "όμουρφ'": [{ORTH: "όμουρφ'", NORM: "όμορφος"}],
 | 
				
			||||||
    "κ'τσή": [{ORTH: "κ'τσή", LEMMA: "κουτσός", NORM: "κουτσός"}],
 | 
					    "κ'τσή": [{ORTH: "κ'τσή", NORM: "κουτσός"}],
 | 
				
			||||||
    "μηδ'": [{ORTH: "μηδ'", LEMMA: "μήδε", NORM: "μήδε"}],
 | 
					    "μηδ'": [{ORTH: "μηδ'", NORM: "μήδε"}],
 | 
				
			||||||
    "'ξομολογήθηκε": [
 | 
					    "'ξομολογήθηκε": [{ORTH: "'ξομολογήθηκε", NORM: "εξομολογούμαι"}],
 | 
				
			||||||
        {ORTH: "'ξομολογήθηκε", LEMMA: "εξομολογούμαι", NORM: "εξομολογούμαι"}
 | 
					    "'μας": [{ORTH: "'μας", NORM: "εμάς"}],
 | 
				
			||||||
    ],
 | 
					    "'ξερες": [{ORTH: "'ξερες", NORM: "ξέρω"}],
 | 
				
			||||||
    "'μας": [{ORTH: "'μας", LEMMA: "εμάς", NORM: "εμάς"}],
 | 
					    "έφθασ'": [{ORTH: "έφθασ'", NORM: "φθάνω"}],
 | 
				
			||||||
    "'ξερες": [{ORTH: "'ξερες", LEMMA: "ξέρω", NORM: "ξέρω"}],
 | 
					    "εξ'": [{ORTH: "εξ'", NORM: "εκ"}],
 | 
				
			||||||
    "έφθασ'": [{ORTH: "έφθασ'", LEMMA: "φθάνω", NORM: "φθάνω"}],
 | 
					    "δώσ'": [{ORTH: "δώσ'", NORM: "δίνω"}],
 | 
				
			||||||
    "εξ'": [{ORTH: "εξ'", LEMMA: "εκ", NORM: "εκ"}],
 | 
					    "τίποτ'": [{ORTH: "τίποτ'", NORM: "τίποτα"}],
 | 
				
			||||||
    "δώσ'": [{ORTH: "δώσ'", LEMMA: "δίνω", NORM: "δίνω"}],
 | 
					    "Λήξ'": [{ORTH: "Λήξ'", NORM: "λήγω"}],
 | 
				
			||||||
    "τίποτ'": [{ORTH: "τίποτ'", LEMMA: "τίποτα", NORM: "τίποτα"}],
 | 
					    "άσ'": [{ORTH: "άσ'", NORM: "αφήνω"}],
 | 
				
			||||||
    "Λήξ'": [{ORTH: "Λήξ'", LEMMA: "λήγω", NORM: "λήγω"}],
 | 
					    "Στ'": [{ORTH: "Στ'", NORM: "στο"}],
 | 
				
			||||||
    "άσ'": [{ORTH: "άσ'", LEMMA: "αφήνω", NORM: "αφήνω"}],
 | 
					    "Δωσ'": [{ORTH: "Δωσ'", NORM: "δίνω"}],
 | 
				
			||||||
    "Στ'": [{ORTH: "Στ'", LEMMA: "στο", NORM: "στο"}],
 | 
					    "Βάψ'": [{ORTH: "Βάψ'", NORM: "βάφω"}],
 | 
				
			||||||
    "Δωσ'": [{ORTH: "Δωσ'", LEMMA: "δίνω", NORM: "δίνω"}],
 | 
					    "Αλλ'": [{ORTH: "Αλλ'", NORM: "αλλά"}],
 | 
				
			||||||
    "Βάψ'": [{ORTH: "Βάψ'", LEMMA: "βάφω", NORM: "βάφω"}],
 | 
					    "Αμ'": [{ORTH: "Αμ'", NORM: "άμα"}],
 | 
				
			||||||
    "Αλλ'": [{ORTH: "Αλλ'", LEMMA: "αλλά", NORM: "αλλά"}],
 | 
					    "Αγόρασ'": [{ORTH: "Αγόρασ'", NORM: "αγοράζω"}],
 | 
				
			||||||
    "Αμ'": [{ORTH: "Αμ'", LEMMA: "άμα", NORM: "άμα"}],
 | 
					    "'φύγε": [{ORTH: "'φύγε", NORM: "φεύγω"}],
 | 
				
			||||||
    "Αγόρασ'": [{ORTH: "Αγόρασ'", LEMMA: "αγοράζω", NORM: "αγοράζω"}],
 | 
					    "'φερε": [{ORTH: "'φερε", NORM: "φέρνω"}],
 | 
				
			||||||
    "'φύγε": [{ORTH: "'φύγε", LEMMA: "φεύγω", NORM: "φεύγω"}],
 | 
					    "'φαγε": [{ORTH: "'φαγε", NORM: "τρώω"}],
 | 
				
			||||||
    "'φερε": [{ORTH: "'φερε", LEMMA: "φέρνω", NORM: "φέρνω"}],
 | 
					    "'σπαγαν": [{ORTH: "'σπαγαν", NORM: "σπάω"}],
 | 
				
			||||||
    "'φαγε": [{ORTH: "'φαγε", LEMMA: "τρώω", NORM: "τρώω"}],
 | 
					    "'σκασε": [{ORTH: "'σκασε", NORM: "σκάω"}],
 | 
				
			||||||
    "'σπαγαν": [{ORTH: "'σπαγαν", LEMMA: "σπάω", NORM: "σπάω"}],
 | 
					    "'σβηνε": [{ORTH: "'σβηνε", NORM: "σβήνω"}],
 | 
				
			||||||
    "'σκασε": [{ORTH: "'σκασε", LEMMA: "σκάω", NORM: "σκάω"}],
 | 
					    "'ριξε": [{ORTH: "'ριξε", NORM: "ρίχνω"}],
 | 
				
			||||||
    "'σβηνε": [{ORTH: "'σβηνε", LEMMA: "σβήνω", NORM: "σβήνω"}],
 | 
					    "'κλεβε": [{ORTH: "'κλεβε", NORM: "κλέβω"}],
 | 
				
			||||||
    "'ριξε": [{ORTH: "'ριξε", LEMMA: "ρίχνω", NORM: "ρίχνω"}],
 | 
					    "'κει": [{ORTH: "'κει", NORM: "εκεί"}],
 | 
				
			||||||
    "'κλεβε": [{ORTH: "'κλεβε", LEMMA: "κλέβω", NORM: "κλέβω"}],
 | 
					    "'βλεπε": [{ORTH: "'βλεπε", NORM: "βλέπω"}],
 | 
				
			||||||
    "'κει": [{ORTH: "'κει", LEMMA: "εκεί", NORM: "εκεί"}],
 | 
					    "'βγαινε": [{ORTH: "'βγαινε", NORM: "βγαίνω"}],
 | 
				
			||||||
    "'βλεπε": [{ORTH: "'βλεπε", LEMMA: "βλέπω", NORM: "βλέπω"}],
 | 
					 | 
				
			||||||
    "'βγαινε": [{ORTH: "'βγαινε", LEMMA: "βγαίνω", NORM: "βγαίνω"}],
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_exc.update(_other_exc)
 | 
					_exc.update(_other_exc)
 | 
				
			||||||
| 
						 | 
					@ -133,35 +132,35 @@ for h in range(1, 12 + 1):
 | 
				
			||||||
    for period in ["π.μ.", "πμ"]:
 | 
					    for period in ["π.μ.", "πμ"]:
 | 
				
			||||||
        _exc[f"{h}{period}"] = [
 | 
					        _exc[f"{h}{period}"] = [
 | 
				
			||||||
            {ORTH: f"{h}"},
 | 
					            {ORTH: f"{h}"},
 | 
				
			||||||
            {ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."},
 | 
					            {ORTH: period, NORM: "π.μ."},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for period in ["μ.μ.", "μμ"]:
 | 
					    for period in ["μ.μ.", "μμ"]:
 | 
				
			||||||
        _exc[f"{h}{period}"] = [
 | 
					        _exc[f"{h}{period}"] = [
 | 
				
			||||||
            {ORTH: f"{h}"},
 | 
					            {ORTH: f"{h}"},
 | 
				
			||||||
            {ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."},
 | 
					            {ORTH: period, NORM: "μ.μ."},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for exc_data in [
 | 
					for exc_data in [
 | 
				
			||||||
    {ORTH: "ΑΓΡ.", LEMMA: "Αγροτικός", NORM: "Αγροτικός"},
 | 
					    {ORTH: "ΑΓΡ.", NORM: "Αγροτικός"},
 | 
				
			||||||
    {ORTH: "Αγ. Γρ.", LEMMA: "Αγία Γραφή", NORM: "Αγία Γραφή"},
 | 
					    {ORTH: "Αγ. Γρ.", NORM: "Αγία Γραφή"},
 | 
				
			||||||
    {ORTH: "Αθ.", LEMMA: "Αθανάσιος", NORM: "Αθανάσιος"},
 | 
					    {ORTH: "Αθ.", NORM: "Αθανάσιος"},
 | 
				
			||||||
    {ORTH: "Αλεξ.", LEMMA: "Αλέξανδρος", NORM: "Αλέξανδρος"},
 | 
					    {ORTH: "Αλεξ.", NORM: "Αλέξανδρος"},
 | 
				
			||||||
    {ORTH: "Απρ.", LEMMA: "Απρίλιος", NORM: "Απρίλιος"},
 | 
					    {ORTH: "Απρ.", NORM: "Απρίλιος"},
 | 
				
			||||||
    {ORTH: "Αύγ.", LEMMA: "Αύγουστος", NORM: "Αύγουστος"},
 | 
					    {ORTH: "Αύγ.", NORM: "Αύγουστος"},
 | 
				
			||||||
    {ORTH: "Δεκ.", LEMMA: "Δεκέμβριος", NORM: "Δεκέμβριος"},
 | 
					    {ORTH: "Δεκ.", NORM: "Δεκέμβριος"},
 | 
				
			||||||
    {ORTH: "Δημ.", LEMMA: "Δήμος", NORM: "Δήμος"},
 | 
					    {ORTH: "Δημ.", NORM: "Δήμος"},
 | 
				
			||||||
    {ORTH: "Ιαν.", LEMMA: "Ιανουάριος", NORM: "Ιανουάριος"},
 | 
					    {ORTH: "Ιαν.", NORM: "Ιανουάριος"},
 | 
				
			||||||
    {ORTH: "Ιούλ.", LEMMA: "Ιούλιος", NORM: "Ιούλιος"},
 | 
					    {ORTH: "Ιούλ.", NORM: "Ιούλιος"},
 | 
				
			||||||
    {ORTH: "Ιούν.", LEMMA: "Ιούνιος", NORM: "Ιούνιος"},
 | 
					    {ORTH: "Ιούν.", NORM: "Ιούνιος"},
 | 
				
			||||||
    {ORTH: "Ιωαν.", LEMMA: "Ιωάννης", NORM: "Ιωάννης"},
 | 
					    {ORTH: "Ιωαν.", NORM: "Ιωάννης"},
 | 
				
			||||||
    {ORTH: "Μ. Ασία", LEMMA: "Μικρά Ασία", NORM: "Μικρά Ασία"},
 | 
					    {ORTH: "Μ. Ασία", NORM: "Μικρά Ασία"},
 | 
				
			||||||
    {ORTH: "Μάρτ.", LEMMA: "Μάρτιος", NORM: "Μάρτιος"},
 | 
					    {ORTH: "Μάρτ.", NORM: "Μάρτιος"},
 | 
				
			||||||
    {ORTH: "Μάρτ'", LEMMA: "Μάρτιος", NORM: "Μάρτιος"},
 | 
					    {ORTH: "Μάρτ'", NORM: "Μάρτιος"},
 | 
				
			||||||
    {ORTH: "Νοέμβρ.", LEMMA: "Νοέμβριος", NORM: "Νοέμβριος"},
 | 
					    {ORTH: "Νοέμβρ.", NORM: "Νοέμβριος"},
 | 
				
			||||||
    {ORTH: "Οκτ.", LEMMA: "Οκτώβριος", NORM: "Οκτώβριος"},
 | 
					    {ORTH: "Οκτ.", NORM: "Οκτώβριος"},
 | 
				
			||||||
    {ORTH: "Σεπτ.", LEMMA: "Σεπτέμβριος", NORM: "Σεπτέμβριος"},
 | 
					    {ORTH: "Σεπτ.", NORM: "Σεπτέμβριος"},
 | 
				
			||||||
    {ORTH: "Φεβρ.", LEMMA: "Φεβρουάριος", NORM: "Φεβρουάριος"},
 | 
					    {ORTH: "Φεβρ.", NORM: "Φεβρουάριος"},
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    _exc[exc_data[ORTH]] = [exc_data]
 | 
					    _exc[exc_data[ORTH]] = [exc_data]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -392,4 +391,4 @@ for orth in [
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    _exc[orth] = [{ORTH: orth}]
 | 
					    _exc[orth] = [{ORTH: orth}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TOKENIZER_EXCEPTIONS = _exc
 | 
					TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,4 @@
 | 
				
			||||||
from typing import Set, Dict, Callable, Any
 | 
					from typing import Callable
 | 
				
			||||||
from thinc.api import Config
 | 
					from thinc.api import Config
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
					from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
				
			||||||
| 
						 | 
					@ -7,60 +7,43 @@ from .lex_attrs import LEX_ATTRS
 | 
				
			||||||
from .syntax_iterators import SYNTAX_ITERATORS
 | 
					from .syntax_iterators import SYNTAX_ITERATORS
 | 
				
			||||||
from .lemmatizer import is_base_form
 | 
					from .lemmatizer import is_base_form
 | 
				
			||||||
from .punctuation import TOKENIZER_INFIXES
 | 
					from .punctuation import TOKENIZER_INFIXES
 | 
				
			||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
					 | 
				
			||||||
from ...language import Language
 | 
					from ...language import Language
 | 
				
			||||||
from ...lemmatizer import Lemmatizer
 | 
					from ...lemmatizer import Lemmatizer
 | 
				
			||||||
from ...util import update_exc, registry
 | 
					from ...lookups import load_lookups
 | 
				
			||||||
 | 
					from ...util import registry
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
DEFAULT_CONFIG = """
 | 
					DEFAULT_CONFIG = """
 | 
				
			||||||
[nlp]
 | 
					[nlp]
 | 
				
			||||||
lang = "en"
 | 
					 | 
				
			||||||
stop_words = {"@language_data": "spacy.en.stop_words"}
 | 
					 | 
				
			||||||
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.lemmatizer]
 | 
					[nlp.lemmatizer]
 | 
				
			||||||
@lemmatizers = "spacy.EnglishLemmatizer.v1"
 | 
					@lemmatizers = "spacy.en.EnglishLemmatizer"
 | 
				
			||||||
 | 
					 | 
				
			||||||
[nlp.lemmatizer.data_paths]
 | 
					 | 
				
			||||||
@language_data = "spacy-lookups-data"
 | 
					 | 
				
			||||||
lang = ${nlp:lang}
 | 
					 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.language_data("spacy.en.stop_words")
 | 
					@registry.lemmatizers("spacy.en.EnglishLemmatizer")
 | 
				
			||||||
def stop_words() -> Set[str]:
 | 
					def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
 | 
				
			||||||
    return STOP_WORDS
 | 
					    tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def lemmatizer_factory(nlp: Language) -> Lemmatizer:
 | 
				
			||||||
 | 
					        lookups = load_lookups(lang=nlp.lang, tables=tables)
 | 
				
			||||||
 | 
					        return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.language_data("spacy.en.lex_attr_getters")
 | 
					    return lemmatizer_factory
 | 
				
			||||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
 | 
					 | 
				
			||||||
    return LEX_ATTRS
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
 | 
					 | 
				
			||||||
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
 | 
					 | 
				
			||||||
    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class EnglishDefaults(Language.Defaults):
 | 
					class EnglishDefaults(Language.Defaults):
 | 
				
			||||||
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
 | 
					    config = Config().from_str(DEFAULT_CONFIG)
 | 
				
			||||||
    syntax_iterators = SYNTAX_ITERATORS
 | 
					    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 | 
				
			||||||
    infixes = TOKENIZER_INFIXES
 | 
					    infixes = TOKENIZER_INFIXES
 | 
				
			||||||
    single_orth_variants = [
 | 
					    lex_attr_getters = LEX_ATTRS
 | 
				
			||||||
        {"tags": ["NFP"], "variants": ["…", "..."]},
 | 
					    syntax_iterators = SYNTAX_ITERATORS
 | 
				
			||||||
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
 | 
					    stop_words = STOP_WORDS
 | 
				
			||||||
    ]
 | 
					 | 
				
			||||||
    paired_orth_variants = [
 | 
					 | 
				
			||||||
        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
 | 
					 | 
				
			||||||
        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
 | 
					 | 
				
			||||||
    ]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class English(Language):
 | 
					class English(Language):
 | 
				
			||||||
    lang = "en"
 | 
					    lang = "en"
 | 
				
			||||||
    Defaults = EnglishDefaults
 | 
					    Defaults = EnglishDefaults
 | 
				
			||||||
    default_config = Config().from_str(DEFAULT_CONFIG)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = ["English"]
 | 
					__all__ = ["English"]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,5 @@
 | 
				
			||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
 | 
					from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
 | 
				
			||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 | 
					from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_infixes = (
 | 
					_infixes = (
 | 
				
			||||||
    LIST_ELLIPSES
 | 
					    LIST_ELLIPSES
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
@@ -1,27 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span
 
 
-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "dobj",
-        "nsubjpass",
-        "pcomp",
-        "pobj",
-        "dative",
-        "appos",
-        "attr",
-        "ROOT",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
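As a usage sketch, the iterator above is what backs Doc.noun_chunks; assuming an installed English pipeline with a dependency parser (e.g. en_core_web_sm, which is not part of this diff):

import spacy

nlp = spacy.load("en_core_web_sm")  # any English pipeline with a parser works
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

# Doc.noun_chunks delegates to the language's registered syntax iterator.
for chunk in doc.noun_chunks:
    print(chunk.text, "<--", chunk.root.dep_)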
@@ -1,4 +1,6 @@
-from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
 
 
 _exc = {}
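The same reduced format applies to user-defined special cases: after this change an exception describes only the token split (ORTH) plus an optional NORM, and lemmas or tags are left to later pipeline components. A small sketch (the "gimme" example is illustrative, not taken from the diff):

from spacy.lang.en import English
from spacy.symbols import ORTH, NORM

nlp = English()
# Only ORTH and NORM are set; no LEMMA or TAG keys anymore.
special_case = [{ORTH: "gim", NORM: "give"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)
print([t.text for t in nlp("gimme that")])  # expected: ['gim', 'me', 'that']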
| 
						 | 
					@ -26,110 +28,110 @@ _exclude = [
 | 
				
			||||||
for pron in ["i"]:
 | 
					for pron in ["i"]:
 | 
				
			||||||
    for orth in [pron, pron.title()]:
 | 
					    for orth in [pron, pron.title()]:
 | 
				
			||||||
        _exc[orth + "'m"] = [
 | 
					        _exc[orth + "'m"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
 | 
					            {ORTH: "'m", NORM: "am"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "m"] = [
 | 
					        _exc[orth + "m"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1},
 | 
					            {ORTH: "m", "tenspect": 1, "number": 1},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'ma"] = [
 | 
					        _exc[orth + "'ma"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'m", LEMMA: "be", NORM: "am"},
 | 
					            {ORTH: "'m", NORM: "am"},
 | 
				
			||||||
            {ORTH: "a", LEMMA: "going to", NORM: "gonna"},
 | 
					            {ORTH: "a", NORM: "gonna"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "ma"] = [
 | 
					        _exc[orth + "ma"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "m", LEMMA: "be", NORM: "am"},
 | 
					            {ORTH: "m", NORM: "am"},
 | 
				
			||||||
            {ORTH: "a", LEMMA: "going to", NORM: "gonna"},
 | 
					            {ORTH: "a", NORM: "gonna"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
 | 
					for pron in ["i", "you", "he", "she", "it", "we", "they"]:
 | 
				
			||||||
    for orth in [pron, pron.title()]:
 | 
					    for orth in [pron, pron.title()]:
 | 
				
			||||||
        _exc[orth + "'ll"] = [
 | 
					        _exc[orth + "'ll"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "'ll", NORM: "will"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "ll"] = [
 | 
					        _exc[orth + "ll"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "ll", NORM: "will"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'ll've"] = [
 | 
					        _exc[orth + "'ll've"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "'ll", NORM: "will"},
 | 
				
			||||||
            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "'ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "llve"] = [
 | 
					        _exc[orth + "llve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "ll", NORM: "will"},
 | 
				
			||||||
            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'d"] = [
 | 
					        _exc[orth + "'d"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'d", NORM: "'d"},
 | 
					            {ORTH: "'d", NORM: "'d"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "d"] = [
 | 
					        _exc[orth + "d"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "d", NORM: "'d"},
 | 
					            {ORTH: "d", NORM: "'d"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'d've"] = [
 | 
					        _exc[orth + "'d've"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
 | 
					            {ORTH: "'d", NORM: "would"},
 | 
				
			||||||
            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "'ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "dve"] = [
 | 
					        _exc[orth + "dve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
 | 
					            {ORTH: "d", NORM: "would"},
 | 
				
			||||||
            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for pron in ["i", "you", "we", "they"]:
 | 
					for pron in ["i", "you", "we", "they"]:
 | 
				
			||||||
    for orth in [pron, pron.title()]:
 | 
					    for orth in [pron, pron.title()]:
 | 
				
			||||||
        _exc[orth + "'ve"] = [
 | 
					        _exc[orth + "'ve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "'ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "ve"] = [
 | 
					        _exc[orth + "ve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for pron in ["you", "we", "they"]:
 | 
					for pron in ["you", "we", "they"]:
 | 
				
			||||||
    for orth in [pron, pron.title()]:
 | 
					    for orth in [pron, pron.title()]:
 | 
				
			||||||
        _exc[orth + "'re"] = [
 | 
					        _exc[orth + "'re"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'re", LEMMA: "be", NORM: "are"},
 | 
					            {ORTH: "'re", NORM: "are"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "re"] = [
 | 
					        _exc[orth + "re"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"},
 | 
					            {ORTH: "re", NORM: "are"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for pron in ["he", "she", "it"]:
 | 
					for pron in ["he", "she", "it"]:
 | 
				
			||||||
    for orth in [pron, pron.title()]:
 | 
					    for orth in [pron, pron.title()]:
 | 
				
			||||||
        _exc[orth + "'s"] = [
 | 
					        _exc[orth + "'s"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "'s", NORM: "'s"},
 | 
					            {ORTH: "'s", NORM: "'s"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "s"] = [
 | 
					        _exc[orth + "s"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
 | 
					            {ORTH: orth, NORM: pron},
 | 
				
			||||||
            {ORTH: "s"},
 | 
					            {ORTH: "s"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -151,145 +153,145 @@ for word in [
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    for orth in [word, word.title()]:
 | 
					    for orth in [word, word.title()]:
 | 
				
			||||||
        _exc[orth + "'s"] = [
 | 
					        _exc[orth + "'s"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "'s", NORM: "'s"},
 | 
					            {ORTH: "'s", NORM: "'s"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "s"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "s"}]
 | 
					        _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'ll"] = [
 | 
					        _exc[orth + "'ll"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "'ll", NORM: "will"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "ll"] = [
 | 
					        _exc[orth + "ll"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "ll", NORM: "will"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'ll've"] = [
 | 
					        _exc[orth + "'ll've"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "'ll", NORM: "will"},
 | 
				
			||||||
            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "'ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "llve"] = [
 | 
					        _exc[orth + "llve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					            {ORTH: "ll", NORM: "will"},
 | 
				
			||||||
            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'re"] = [
 | 
					        _exc[orth + "'re"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "'re", LEMMA: "be", NORM: "are"},
 | 
					            {ORTH: "'re", NORM: "are"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "re"] = [
 | 
					        _exc[orth + "re"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "re", LEMMA: "be", NORM: "are"},
 | 
					            {ORTH: "re", NORM: "are"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'ve"] = [
 | 
					        _exc[orth + "'ve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "'ve", LEMMA: "have", TAG: "VB"},
 | 
					            {ORTH: "'ve"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "ve"] = [
 | 
					        _exc[orth + "ve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word},
 | 
					            {ORTH: orth},
 | 
				
			||||||
            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'d"] = [
 | 
					        _exc[orth + "'d"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "'d", NORM: "'d"},
 | 
					            {ORTH: "'d", NORM: "'d"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "d"] = [
 | 
					        _exc[orth + "d"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "d", NORM: "'d"},
 | 
					            {ORTH: "d", NORM: "'d"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "'d've"] = [
 | 
					        _exc[orth + "'d've"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
 | 
					            {ORTH: "'d", NORM: "would"},
 | 
				
			||||||
            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "'ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[orth + "dve"] = [
 | 
					        _exc[orth + "dve"] = [
 | 
				
			||||||
            {ORTH: orth, LEMMA: word, NORM: word},
 | 
					            {ORTH: orth, NORM: word},
 | 
				
			||||||
            {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
 | 
					            {ORTH: "d", NORM: "would"},
 | 
				
			||||||
            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Verbs
 | 
					# Verbs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for verb_data in [
 | 
					for verb_data in [
 | 
				
			||||||
    {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
 | 
					    {ORTH: "ca", NORM: "can"},
 | 
				
			||||||
    {ORTH: "could", NORM: "could", TAG: "MD"},
 | 
					    {ORTH: "could", NORM: "could"},
 | 
				
			||||||
    {ORTH: "do", LEMMA: "do", NORM: "do"},
 | 
					    {ORTH: "do", NORM: "do"},
 | 
				
			||||||
    {ORTH: "does", LEMMA: "do", NORM: "does"},
 | 
					    {ORTH: "does", NORM: "does"},
 | 
				
			||||||
    {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
 | 
					    {ORTH: "did", NORM: "do"},
 | 
				
			||||||
    {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
 | 
					    {ORTH: "had", NORM: "have"},
 | 
				
			||||||
    {ORTH: "may", NORM: "may", TAG: "MD"},
 | 
					    {ORTH: "may", NORM: "may"},
 | 
				
			||||||
    {ORTH: "might", NORM: "might", TAG: "MD"},
 | 
					    {ORTH: "might", NORM: "might"},
 | 
				
			||||||
    {ORTH: "must", NORM: "must", TAG: "MD"},
 | 
					    {ORTH: "must", NORM: "must"},
 | 
				
			||||||
    {ORTH: "need", NORM: "need"},
 | 
					    {ORTH: "need", NORM: "need"},
 | 
				
			||||||
    {ORTH: "ought", NORM: "ought", TAG: "MD"},
 | 
					    {ORTH: "ought", NORM: "ought"},
 | 
				
			||||||
    {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
 | 
					    {ORTH: "sha", NORM: "shall"},
 | 
				
			||||||
    {ORTH: "should", NORM: "should", TAG: "MD"},
 | 
					    {ORTH: "should", NORM: "should"},
 | 
				
			||||||
    {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
 | 
					    {ORTH: "wo", NORM: "will"},
 | 
				
			||||||
    {ORTH: "would", NORM: "would", TAG: "MD"},
 | 
					    {ORTH: "would", NORM: "would"},
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    verb_data_tc = dict(verb_data)
 | 
					    verb_data_tc = dict(verb_data)
 | 
				
			||||||
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | 
					    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | 
				
			||||||
    for data in [verb_data, verb_data_tc]:
 | 
					    for data in [verb_data, verb_data_tc]:
 | 
				
			||||||
        _exc[data[ORTH] + "n't"] = [
 | 
					        _exc[data[ORTH] + "n't"] = [
 | 
				
			||||||
            dict(data),
 | 
					            dict(data),
 | 
				
			||||||
            {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
 | 
					            {ORTH: "n't", NORM: "not"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[data[ORTH] + "nt"] = [
 | 
					        _exc[data[ORTH] + "nt"] = [
 | 
				
			||||||
            dict(data),
 | 
					            dict(data),
 | 
				
			||||||
            {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
 | 
					            {ORTH: "nt", NORM: "not"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[data[ORTH] + "n't've"] = [
 | 
					        _exc[data[ORTH] + "n't've"] = [
 | 
				
			||||||
            dict(data),
 | 
					            dict(data),
 | 
				
			||||||
            {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
 | 
					            {ORTH: "n't", NORM: "not"},
 | 
				
			||||||
            {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "'ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[data[ORTH] + "ntve"] = [
 | 
					        _exc[data[ORTH] + "ntve"] = [
 | 
				
			||||||
            dict(data),
 | 
					            dict(data),
 | 
				
			||||||
            {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
 | 
					            {ORTH: "nt", NORM: "not"},
 | 
				
			||||||
            {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					            {ORTH: "ve", NORM: "have"},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for verb_data in [
 | 
					for verb_data in [
 | 
				
			||||||
    {ORTH: "could", NORM: "could", TAG: "MD"},
 | 
					    {ORTH: "could", NORM: "could"},
 | 
				
			||||||
    {ORTH: "might", NORM: "might", TAG: "MD"},
 | 
					    {ORTH: "might", NORM: "might"},
 | 
				
			||||||
    {ORTH: "must", NORM: "must", TAG: "MD"},
 | 
					    {ORTH: "must", NORM: "must"},
 | 
				
			||||||
    {ORTH: "should", NORM: "should", TAG: "MD"},
 | 
					    {ORTH: "should", NORM: "should"},
 | 
				
			||||||
    {ORTH: "would", NORM: "would", TAG: "MD"},
 | 
					    {ORTH: "would", NORM: "would"},
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    verb_data_tc = dict(verb_data)
 | 
					    verb_data_tc = dict(verb_data)
 | 
				
			||||||
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | 
					    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
 | 
				
			||||||
    for data in [verb_data, verb_data_tc]:
 | 
					    for data in [verb_data, verb_data_tc]:
 | 
				
			||||||
        _exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
 | 
					        _exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        _exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve", LEMMA: "have", TAG: "VB"}]
 | 
					        _exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve"}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for verb_data in [
 | 
					for verb_data in [
 | 
				
			||||||
    {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
 | 
					    {ORTH: "ai", "number": 2},
 | 
				
			||||||
    {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
 | 
					    {ORTH: "are", NORM: "are", "number": 2},
 | 
				
			||||||
    {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
 | 
					    {ORTH: "is", NORM: "is"},
 | 
				
			||||||
    {ORTH: "was", LEMMA: "be", NORM: "was"},
 | 
					    {ORTH: "was", NORM: "was"},
 | 
				
			||||||
    {ORTH: "were", LEMMA: "be", NORM: "were"},
 | 
					    {ORTH: "were", NORM: "were"},
 | 
				
			||||||
    {ORTH: "have", NORM: "have"},
 | 
					    {ORTH: "have", NORM: "have"},
 | 
				
			||||||
    {ORTH: "has", LEMMA: "have", NORM: "has"},
 | 
					    {ORTH: "has", NORM: "has"},
 | 
				
			||||||
    {ORTH: "dare", NORM: "dare"},
 | 
					    {ORTH: "dare", NORM: "dare"},
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    verb_data_tc = dict(verb_data)
 | 
					    verb_data_tc = dict(verb_data)
 | 
				
			||||||
@@ -297,24 +299,24 @@ for verb_data in [
     for data in [verb_data, verb_data_tc]:
         _exc[data[ORTH] + "n't"] = [
             dict(data),
-            {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
+            {ORTH: "n't", NORM: "not"},
         ]
 
         _exc[data[ORTH] + "nt"] = [
             dict(data),
-            {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
+            {ORTH: "nt", NORM: "not"},
         ]
 
 
 # Other contractions with trailing apostrophe
 
 for exc_data in [
-    {ORTH: "doin", LEMMA: "do", NORM: "doing"},
-    {ORTH: "goin", LEMMA: "go", NORM: "going"},
-    {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
-    {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
-    {ORTH: "ol", LEMMA: "old", NORM: "old"},
-    {ORTH: "somethin", LEMMA: "something", NORM: "something"},
+    {ORTH: "doin", NORM: "doing"},
+    {ORTH: "goin", NORM: "going"},
+    {ORTH: "nothin", NORM: "nothing"},
+    {ORTH: "nuthin", NORM: "nothing"},
+    {ORTH: "ol", NORM: "old"},
+    {ORTH: "somethin", NORM: "something"},
 ]:
     exc_data_tc = dict(exc_data)
     exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
@@ -329,9 +331,9 @@ for exc_data in [
 
 for exc_data in [
     {ORTH: "cause", NORM: "because"},
-    {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
-    {ORTH: "ll", LEMMA: "will", NORM: "will"},
-    {ORTH: "nuff", LEMMA: "enough", NORM: "enough"},
+    {ORTH: "em", NORM: "them"},
+    {ORTH: "ll", NORM: "will"},
+    {ORTH: "nuff", NORM: "enough"},
 ]:
     exc_data_apos = dict(exc_data)
     exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
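For context, the loop body that follows these entries (unchanged in this diff) registers both the bare and the apostrophe-prefixed spelling under the same NORM. A standalone sketch of that expansion:

from spacy.symbols import ORTH, NORM

_exc = {}
for exc_data in [{ORTH: "em", NORM: "them"}, {ORTH: "nuff", NORM: "enough"}]:
    exc_data_apos = dict(exc_data)
    exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
    # Register both spellings as single-token special cases.
    for data in [exc_data, exc_data_apos]:
        _exc[data[ORTH]] = [data]

# _exc now has entries for "em", "'em", "nuff" and "'nuff".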
| 
						 | 
					@ -345,166 +347,131 @@ for h in range(1, 12 + 1):
 | 
				
			||||||
    for period in ["a.m.", "am"]:
 | 
					    for period in ["a.m.", "am"]:
 | 
				
			||||||
        _exc[f"{h}{period}"] = [
 | 
					        _exc[f"{h}{period}"] = [
 | 
				
			||||||
            {ORTH: f"{h}"},
 | 
					            {ORTH: f"{h}"},
 | 
				
			||||||
            {ORTH: period, LEMMA: "a.m.", NORM: "a.m."},
 | 
					            {ORTH: period, NORM: "a.m."},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
    for period in ["p.m.", "pm"]:
 | 
					    for period in ["p.m.", "pm"]:
 | 
				
			||||||
        _exc[f"{h}{period}"] = [
 | 
					        _exc[f"{h}{period}"] = [
 | 
				
			||||||
            {ORTH: f"{h}"},
 | 
					            {ORTH: f"{h}"},
 | 
				
			||||||
            {ORTH: period, LEMMA: "p.m.", NORM: "p.m."},
 | 
					            {ORTH: period, NORM: "p.m."},
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Rest
 | 
					# Rest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_other_exc = {
 | 
					_other_exc = {
 | 
				
			||||||
    "y'all": [{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}],
 | 
					    "y'all": [{ORTH: "y'", NORM: "you"}, {ORTH: "all"}],
 | 
				
			||||||
    "yall": [{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}],
 | 
					    "yall": [{ORTH: "y", NORM: "you"}, {ORTH: "all"}],
 | 
				
			||||||
    "how'd'y": [
 | 
					    "how'd'y": [{ORTH: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}],
 | 
				
			||||||
        {ORTH: "how", LEMMA: "how"},
 | 
					    "How'd'y": [{ORTH: "How", NORM: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}],
 | 
				
			||||||
        {ORTH: "'d", LEMMA: "do"},
 | 
					    "not've": [{ORTH: "not"}, {ORTH: "'ve", NORM: "have"}],
 | 
				
			||||||
        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"},
 | 
					    "notve": [{ORTH: "not"}, {ORTH: "ve", NORM: "have"}],
 | 
				
			||||||
    ],
 | 
					    "Not've": [{ORTH: "Not", NORM: "not"}, {ORTH: "'ve", NORM: "have"}],
 | 
				
			||||||
    "How'd'y": [
 | 
					    "Notve": [{ORTH: "Not", NORM: "not"}, {ORTH: "ve", NORM: "have"}],
 | 
				
			||||||
        {ORTH: "How", LEMMA: "how", NORM: "how"},
 | 
					    "cannot": [{ORTH: "can"}, {ORTH: "not"}],
 | 
				
			||||||
        {ORTH: "'d", LEMMA: "do"},
 | 
					    "Cannot": [{ORTH: "Can", NORM: "can"}, {ORTH: "not"}],
 | 
				
			||||||
        {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"},
 | 
					    "gonna": [{ORTH: "gon", NORM: "going"}, {ORTH: "na", NORM: "to"}],
 | 
				
			||||||
    ],
 | 
					    "Gonna": [{ORTH: "Gon", NORM: "going"}, {ORTH: "na", NORM: "to"}],
 | 
				
			||||||
    "not've": [
 | 
					    "gotta": [{ORTH: "got"}, {ORTH: "ta", NORM: "to"}],
 | 
				
			||||||
        {ORTH: "not", LEMMA: "not", TAG: "RB"},
 | 
					    "Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", NORM: "to"}],
 | 
				
			||||||
        {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					    "let's": [{ORTH: "let"}, {ORTH: "'s", NORM: "us"}],
 | 
				
			||||||
    ],
 | 
					    "Let's": [{ORTH: "Let", NORM: "let"}, {ORTH: "'s", NORM: "us"}],
 | 
				
			||||||
    "notve": [
 | 
					    "c'mon": [{ORTH: "c'm", NORM: "come"}, {ORTH: "on"}],
 | 
				
			||||||
        {ORTH: "not", LEMMA: "not", TAG: "RB"},
 | 
					    "C'mon": [{ORTH: "C'm", NORM: "come"}, {ORTH: "on"}],
 | 
				
			||||||
        {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "Not've": [
 | 
					 | 
				
			||||||
        {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
 | 
					 | 
				
			||||||
        {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "Notve": [
 | 
					 | 
				
			||||||
        {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
 | 
					 | 
				
			||||||
        {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "cannot": [
 | 
					 | 
				
			||||||
        {ORTH: "can", LEMMA: "can", TAG: "MD"},
 | 
					 | 
				
			||||||
        {ORTH: "not", LEMMA: "not", TAG: "RB"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "Cannot": [
 | 
					 | 
				
			||||||
        {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
 | 
					 | 
				
			||||||
        {ORTH: "not", LEMMA: "not", TAG: "RB"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "gonna": [
 | 
					 | 
				
			||||||
        {ORTH: "gon", LEMMA: "go", NORM: "going"},
 | 
					 | 
				
			||||||
        {ORTH: "na", LEMMA: "to", NORM: "to"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "Gonna": [
 | 
					 | 
				
			||||||
        {ORTH: "Gon", LEMMA: "go", NORM: "going"},
 | 
					 | 
				
			||||||
        {ORTH: "na", LEMMA: "to", NORM: "to"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "gotta": [{ORTH: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
 | 
					 | 
				
			||||||
    "Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
 | 
					 | 
				
			||||||
    "let's": [{ORTH: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
 | 
					 | 
				
			||||||
    "Let's": [
 | 
					 | 
				
			||||||
        {ORTH: "Let", LEMMA: "let", NORM: "let"},
 | 
					 | 
				
			||||||
        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 | 
					 | 
				
			||||||
    "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_exc.update(_other_exc)
 | 
					_exc.update(_other_exc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
for exc_data in [
 | 
					for exc_data in [
 | 
				
			||||||
    {ORTH: "'S", LEMMA: "'s", NORM: "'s"},
 | 
					    {ORTH: "'S", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "'s", LEMMA: "'s", NORM: "'s"},
 | 
					    {ORTH: "'s", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
 | 
					    {ORTH: "\u2018S", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
 | 
					    {ORTH: "\u2018s", NORM: "'s"},
 | 
				
			||||||
    {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
 | 
					    {ORTH: "and/or", NORM: "and/or"},
 | 
				
			||||||
    {ORTH: "w/o", LEMMA: "without", NORM: "without"},
 | 
					    {ORTH: "w/o", NORM: "without"},
 | 
				
			||||||
    {ORTH: "'re", LEMMA: "be", NORM: "are"},
 | 
					    {ORTH: "'re", NORM: "are"},
 | 
				
			||||||
    {ORTH: "'Cause", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'Cause", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'cause", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'cause", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'cos", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'cos", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'Cos", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'Cos", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'coz", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'coz", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'Coz", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'Coz", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'cuz", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'cuz", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
 | 
					    {ORTH: "'Cuz", NORM: "because"},
 | 
				
			||||||
    {ORTH: "'bout", LEMMA: "about", NORM: "about"},
 | 
					    {ORTH: "'bout", NORM: "about"},
 | 
				
			||||||
    {ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
 | 
					    {ORTH: "ma'am", NORM: "madam"},
 | 
				
			||||||
    {ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
 | 
					    {ORTH: "Ma'am", NORM: "madam"},
 | 
				
			||||||
    {ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
 | 
					    {ORTH: "o'clock", NORM: "o'clock"},
 | 
				
			||||||
    {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
 | 
					    {ORTH: "O'clock", NORM: "o'clock"},
 | 
				
			||||||
    {ORTH: "lovin'", LEMMA: "love", NORM: "loving"},
 | 
					    {ORTH: "lovin'", NORM: "loving"},
 | 
				
			||||||
    {ORTH: "Lovin'", LEMMA: "love", NORM: "loving"},
 | 
					    {ORTH: "Lovin'", NORM: "loving"},
 | 
				
			||||||
    {ORTH: "lovin", LEMMA: "love", NORM: "loving"},
 | 
					    {ORTH: "lovin", NORM: "loving"},
 | 
				
			||||||
    {ORTH: "Lovin", LEMMA: "love", NORM: "loving"},
 | 
					    {ORTH: "Lovin", NORM: "loving"},
 | 
				
			||||||
    {ORTH: "havin'", LEMMA: "have", NORM: "having"},
 | 
					    {ORTH: "havin'", NORM: "having"},
 | 
				
			||||||
    {ORTH: "Havin'", LEMMA: "have", NORM: "having"},
 | 
					    {ORTH: "Havin'", NORM: "having"},
 | 
				
			||||||
    {ORTH: "havin", LEMMA: "have", NORM: "having"},
 | 
					    {ORTH: "havin", NORM: "having"},
 | 
				
			||||||
    {ORTH: "Havin", LEMMA: "have", NORM: "having"},
 | 
					    {ORTH: "Havin", NORM: "having"},
 | 
				
			||||||
    {ORTH: "doin'", LEMMA: "do", NORM: "doing"},
 | 
					    {ORTH: "doin'", NORM: "doing"},
 | 
				
			||||||
    {ORTH: "Doin'", LEMMA: "do", NORM: "doing"},
 | 
					    {ORTH: "Doin'", NORM: "doing"},
 | 
				
			||||||
    {ORTH: "doin", LEMMA: "do", NORM: "doing"},
 | 
					    {ORTH: "doin", NORM: "doing"},
 | 
				
			||||||
    {ORTH: "Doin", LEMMA: "do", NORM: "doing"},
 | 
					    {ORTH: "Doin", NORM: "doing"},
 | 
				
			||||||
    {ORTH: "goin'", LEMMA: "go", NORM: "going"},
 | 
					    {ORTH: "goin'", NORM: "going"},
 | 
				
			||||||
    {ORTH: "Goin'", LEMMA: "go", NORM: "going"},
 | 
					    {ORTH: "Goin'", NORM: "going"},
 | 
				
			||||||
    {ORTH: "goin", LEMMA: "go", NORM: "going"},
 | 
					    {ORTH: "goin", NORM: "going"},
 | 
				
			||||||
    {ORTH: "Goin", LEMMA: "go", NORM: "going"},
 | 
					    {ORTH: "Goin", NORM: "going"},
 | 
				
			||||||
    {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
 | 
					    {ORTH: "Mt.", NORM: "Mount"},
 | 
				
			||||||
    {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
 | 
					    {ORTH: "Ak.", NORM: "Alaska"},
 | 
				
			||||||
    {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
 | 
					    {ORTH: "Ala.", NORM: "Alabama"},
 | 
				
			||||||
    {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
 | 
					    {ORTH: "Apr.", NORM: "April"},
 | 
				
			||||||
    {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
 | 
					    {ORTH: "Ariz.", NORM: "Arizona"},
 | 
				
			||||||
    {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
 | 
					    {ORTH: "Ark.", NORM: "Arkansas"},
 | 
				
			||||||
    {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
 | 
					    {ORTH: "Aug.", NORM: "August"},
 | 
				
			||||||
    {ORTH: "Calif.", LEMMA: "California", NORM: "California"},
 | 
					    {ORTH: "Calif.", NORM: "California"},
 | 
				
			||||||
    {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
 | 
					    {ORTH: "Colo.", NORM: "Colorado"},
 | 
				
			||||||
    {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
 | 
					    {ORTH: "Conn.", NORM: "Connecticut"},
 | 
				
			||||||
    {ORTH: "Dec.", LEMMA: "December", NORM: "December"},
 | 
					    {ORTH: "Dec.", NORM: "December"},
 | 
				
			||||||
    {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
 | 
					    {ORTH: "Del.", NORM: "Delaware"},
 | 
				
			||||||
    {ORTH: "Feb.", LEMMA: "February", NORM: "February"},
 | 
					    {ORTH: "Feb.", NORM: "February"},
 | 
				
			||||||
    {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
 | 
					    {ORTH: "Fla.", NORM: "Florida"},
 | 
				
			||||||
    {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
 | 
					    {ORTH: "Ga.", NORM: "Georgia"},
 | 
				
			||||||
    {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
 | 
					    {ORTH: "Ia.", NORM: "Iowa"},
 | 
				
			||||||
    {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
 | 
					    {ORTH: "Id.", NORM: "Idaho"},
 | 
				
			||||||
    {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
 | 
					    {ORTH: "Ill.", NORM: "Illinois"},
 | 
				
			||||||
    {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
 | 
					    {ORTH: "Ind.", NORM: "Indiana"},
 | 
				
			||||||
    {ORTH: "Jan.", LEMMA: "January", NORM: "January"},
 | 
					    {ORTH: "Jan.", NORM: "January"},
 | 
				
			||||||
    {ORTH: "Jul.", LEMMA: "July", NORM: "July"},
 | 
					    {ORTH: "Jul.", NORM: "July"},
 | 
				
			||||||
    {ORTH: "Jun.", LEMMA: "June", NORM: "June"},
 | 
					    {ORTH: "Jun.", NORM: "June"},
 | 
				
			||||||
    {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
 | 
					    {ORTH: "Kan.", NORM: "Kansas"},
 | 
				
			||||||
    {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
 | 
					    {ORTH: "Kans.", NORM: "Kansas"},
 | 
				
			||||||
    {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
 | 
					    {ORTH: "Ky.", NORM: "Kentucky"},
 | 
				
			||||||
    {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
 | 
					    {ORTH: "La.", NORM: "Louisiana"},
 | 
				
			||||||
    {ORTH: "Mar.", LEMMA: "March", NORM: "March"},
 | 
					    {ORTH: "Mar.", NORM: "March"},
 | 
				
			||||||
    {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
 | 
					    {ORTH: "Mass.", NORM: "Massachusetts"},
 | 
				
			||||||
    {ORTH: "May.", LEMMA: "May", NORM: "May"},
 | 
					    {ORTH: "May.", NORM: "May"},
 | 
				
			||||||
    {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
 | 
					    {ORTH: "Mich.", NORM: "Michigan"},
 | 
				
			||||||
    {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
 | 
					    {ORTH: "Minn.", NORM: "Minnesota"},
 | 
				
			||||||
    {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
 | 
					    {ORTH: "Miss.", NORM: "Mississippi"},
 | 
				
			||||||
    {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
 | 
					    {ORTH: "N.C.", NORM: "North Carolina"},
 | 
				
			||||||
    {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
 | 
					    {ORTH: "N.D.", NORM: "North Dakota"},
 | 
				
			||||||
    {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
 | 
					    {ORTH: "N.H.", NORM: "New Hampshire"},
 | 
				
			||||||
    {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
 | 
					    {ORTH: "N.J.", NORM: "New Jersey"},
 | 
				
			||||||
    {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
 | 
					    {ORTH: "N.M.", NORM: "New Mexico"},
 | 
				
			||||||
    {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
 | 
					    {ORTH: "N.Y.", NORM: "New York"},
 | 
				
			||||||
    {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
 | 
					    {ORTH: "Neb.", NORM: "Nebraska"},
 | 
				
			||||||
    {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
 | 
					    {ORTH: "Nebr.", NORM: "Nebraska"},
 | 
				
			||||||
    {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
 | 
					    {ORTH: "Nev.", NORM: "Nevada"},
 | 
				
			||||||
    {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
 | 
					    {ORTH: "Nov.", NORM: "November"},
 | 
				
			||||||
    {ORTH: "Oct.", LEMMA: "October", NORM: "October"},
 | 
					    {ORTH: "Oct.", NORM: "October"},
 | 
				
			||||||
    {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
 | 
					    {ORTH: "Okla.", NORM: "Oklahoma"},
 | 
				
			||||||
    {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
 | 
					    {ORTH: "Ore.", NORM: "Oregon"},
 | 
				
			||||||
    {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
 | 
					    {ORTH: "Pa.", NORM: "Pennsylvania"},
 | 
				
			||||||
    {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
 | 
					    {ORTH: "S.C.", NORM: "South Carolina"},
 | 
				
			||||||
    {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
 | 
					    {ORTH: "Sep.", NORM: "September"},
 | 
				
			||||||
    {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
 | 
					    {ORTH: "Sept.", NORM: "September"},
 | 
				
			||||||
    {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
 | 
					    {ORTH: "Tenn.", NORM: "Tennessee"},
 | 
				
			||||||
    {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
 | 
					    {ORTH: "Va.", NORM: "Virginia"},
 | 
				
			||||||
    {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
 | 
					    {ORTH: "Wash.", NORM: "Washington"},
 | 
				
			||||||
    {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"},
 | 
					    {ORTH: "Wis.", NORM: "Wisconsin"},
 | 
				
			||||||
]:
 | 
					]:
 | 
				
			||||||
    _exc[exc_data[ORTH]] = [exc_data]
 | 
					    _exc[exc_data[ORTH]] = [exc_data]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -555,4 +522,4 @@ for string in _exclude:
         _exc.pop(string)
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
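The net effect of this last hunk is that merging with the shared BASE_EXCEPTIONS now happens where the table is defined rather than in Defaults. Conceptually update_exc copies the base table and lets language-specific entries win on conflicts; a rough sketch of that behaviour (not the actual implementation, which also validates the entries):

def update_exc_sketch(base_exceptions, *addition_dicts):
    # Copy the base table, then layer language-specific entries on top so a
    # language can override a shared special case for the same string.
    exc = dict(base_exceptions)
    for additions in addition_dicts:
        exc.update(additions)
    return exc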
@@ -1,52 +1,23 @@
-from typing import Set, Dict, Callable, Any
-from thinc.config import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "es"
-stop_words = {"@language_data": "spacy.es.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.es.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.es.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class SpanishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
 
 
 class Spanish(Language):
     lang = "es"
     Defaults = SpanishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Spanish"]
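A quick sanity check for this kind of refactor is that the data declared on the Defaults subclass is still reachable from the language class; roughly (attribute names as used in the diff, exact accessors may differ in the final API):

from spacy.lang.es import Spanish

# Language data is now introspectable as plain class attributes on the Defaults.
print(len(Spanish.Defaults.stop_words), "stop words")
print("pal" in Spanish.Defaults.tokenizer_exceptions)  # special case defined in the exceptions module below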
@@ -1,13 +1,15 @@
+from typing import Union, Iterator, Optional, List, Tuple
+
 from ...symbols import NOUN, PROPN, PRON, VERB, AUX
 from ...errors import Errors
+from ...tokens import Doc, Span, Token
 
 
-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     doc = doclike.doc
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     if not len(doc):
         return
     np_label = doc.vocab.strings.add("NP")
@@ -28,18 +30,24 @@ def noun_chunks(doclike):
         token = next_token(token)
 
 
-def is_verb_token(token):
+def is_verb_token(token: Token) -> bool:
     return token.pos in [VERB, AUX]
 
 
-def next_token(token):
+def next_token(token: Token) -> Optional[Token]:
     try:
         return token.nbor()
     except IndexError:
         return None
 
 
-def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
+def noun_bounds(
+    doc: Doc,
+    root: Token,
+    np_left_deps: List[str],
+    np_right_deps: List[str],
+    stop_deps: List[str],
+) -> Tuple[Token, Token]:
     left_bound = root
     for token in reversed(list(root.lefts)):
         if token.dep in np_left_deps:
@@ -50,12 +58,8 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
             left, right = noun_bounds(
                 doc, token, np_left_deps, np_right_deps, stop_deps
             )
-            if list(
-                filter(
-                    lambda t: is_verb_token(t) or t.dep in stop_deps,
-                    doc[left_bound.i : right.i],
-                )
-            ):
+            filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
+            if list(filter(filter_func, doc[left_bound.i : right.i],)):
                 break
             else:
                 right_bound = right
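The filter_func line above just names the predicate that was previously inlined: is there any verb or stopping dependency between the current left bound and the candidate right bound? An equivalent standalone helper, written with any for clarity (hypothetical, not part of the diff):

from typing import List

from spacy.tokens import Doc, Token


def crosses_boundary(doc: Doc, left: Token, right: Token, stop_deps: List[int]) -> bool:
    # True if any token between the bounds is a verb/aux or carries a stopping dependency.
    return any(
        t.pos_ in ("VERB", "AUX") or t.dep in stop_deps
        for t in doc[left.i : right.i]
    )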
@@ -1,25 +1,27 @@
-from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
 
 
 _exc = {
-    "pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}],
+    "pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
 }
 
 
 for exc_data in [
-    {ORTH: "n°", LEMMA: "número"},
-    {ORTH: "°C", LEMMA: "grados Celcius"},
-    {ORTH: "aprox.", LEMMA: "aproximadamente"},
-    {ORTH: "dna.", LEMMA: "docena"},
-    {ORTH: "dpto.", LEMMA: "departamento"},
-    {ORTH: "ej.", LEMMA: "ejemplo"},
-    {ORTH: "esq.", LEMMA: "esquina"},
-    {ORTH: "pág.", LEMMA: "página"},
-    {ORTH: "p.ej.", LEMMA: "por ejemplo"},
-    {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"},
-    {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
-    {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
-    {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
+    {ORTH: "n°"},
+    {ORTH: "°C"},
+    {ORTH: "aprox."},
+    {ORTH: "dna."},
+    {ORTH: "dpto."},
+    {ORTH: "ej."},
+    {ORTH: "esq."},
+    {ORTH: "pág."},
+    {ORTH: "p.ej."},
+    {ORTH: "Ud.", NORM: "usted"},
+    {ORTH: "Vd.", NORM: "usted"},
+    {ORTH: "Uds.", NORM: "ustedes"},
+    {ORTH: "Vds.", NORM: "ustedes"},
     {ORTH: "vol.", NORM: "volúmen"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
@@ -27,14 +29,14 @@ for exc_data in [
 
 # Times
 
-_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
+_exc["12m."] = [{ORTH: "12"}, {ORTH: "m."}]
 
 
 for h in range(1, 12 + 1):
     for period in ["a.m.", "am"]:
-        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}]
     for period in ["p.m.", "pm"]:
-        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}]
 
 
 for orth in [
@@ -73,4 +75,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
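These static exceptions can be checked on a blank pipeline without any trained components; for example (the comment states the expected split, it is not verified here):

from spacy.lang.es import Spanish

nlp = Spanish()
# "11am" matches one of the generated time exceptions and should split into "11" + "am".
print([t.text for t in nlp("Llegamos a las 11am a la oficina.")])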
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "et"
-stop_words = {"@language_data": "spacy.et.stop_words"}
-"""
-
-
-@registry.language_data("spacy.et.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class EstonianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Estonian(Language):
     lang = "et"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = EstonianDefaults


 __all__ = ["Estonian"]
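Each per-language __init__.py below follows the same shape: the inline DEFAULT_CONFIG string and its @registry.language_data helpers are dropped, and the data moves onto class attributes of a Language.Defaults subclass. A minimal sketch of that pattern, assuming only spacy.language.Language; the language code and stop-word set are invented for illustration:

from spacy.language import Language

EXAMPLE_STOP_WORDS = {"ja", "ei"}  # hypothetical data, not from the diff


class ExampleDefaults(Language.Defaults):
    # Plain class attributes replace the old config-string/registry indirection.
    stop_words = EXAMPLE_STOP_WORDS


class Example(Language):
    lang = "zz"  # hypothetical language code
    Defaults = ExampleDefaults


print(Example.Defaults.stop_words)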
@@ -1,41 +1,18 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "eu"
-stop_words = {"@language_data": "spacy.eu.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.eu.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.eu.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class BasqueDefaults(Language.Defaults):
-    tokenizer_exceptions = BASE_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS


 class Basque(Language):
     lang = "eu"
     Defaults = BasqueDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Basque"]
@@ -1,55 +1,23 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
-from ...language import Language
-from ...util import update_exc, registry
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
+from ...language import Language
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "fa"
-stop_words = {"@language_data": "spacy.fa.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.fa.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.fa.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class PersianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


 class Persian(Language):
     lang = "fa"
     Defaults = PersianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Persian"]

(File diff suppressed because it is too large.)
@@ -1,43 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "fi"
-stop_words = {"@language_data": "spacy.fi.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.fi.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.fi.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class FinnishDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Finnish(Language):
     lang = "fi"
     Defaults = FinnishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Finnish"]
@@ -1,4 +1,6 @@
-from ...symbols import ORTH, LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH
+from ...util import update_exc


 _exc = {}
@@ -6,76 +8,76 @@ _exc = {}

 # Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
 for exc_data in [
-    {ORTH: "aik.", LEMMA: "aikaisempi"},
+    {ORTH: "aik."},
-    {ORTH: "alk.", LEMMA: "alkaen"},
+    {ORTH: "alk."},
-    {ORTH: "alv.", LEMMA: "arvonlisävero"},
+    {ORTH: "alv."},
-    {ORTH: "ark.", LEMMA: "arkisin"},
+    {ORTH: "ark."},
-    {ORTH: "as.", LEMMA: "asunto"},
+    {ORTH: "as."},
-    {ORTH: "eaa.", LEMMA: "ennen ajanlaskun alkua"},
+    {ORTH: "eaa."},
-    {ORTH: "ed.", LEMMA: "edellinen"},
+    {ORTH: "ed."},
-    {ORTH: "esim.", LEMMA: "esimerkki"},
+    {ORTH: "esim."},
-    {ORTH: "huom.", LEMMA: "huomautus"},
+    {ORTH: "huom."},
-    {ORTH: "jne.", LEMMA: "ja niin edelleen"},
+    {ORTH: "jne."},
-    {ORTH: "joht.", LEMMA: "johtaja"},
+    {ORTH: "joht."},
-    {ORTH: "k.", LEMMA: "kuollut"},
+    {ORTH: "k."},
-    {ORTH: "ks.", LEMMA: "katso"},
+    {ORTH: "ks."},
-    {ORTH: "lk.", LEMMA: "luokka"},
+    {ORTH: "lk."},
-    {ORTH: "lkm.", LEMMA: "lukumäärä"},
+    {ORTH: "lkm."},
-    {ORTH: "lyh.", LEMMA: "lyhenne"},
+    {ORTH: "lyh."},
-    {ORTH: "läh.", LEMMA: "lähettäjä"},
+    {ORTH: "läh."},
-    {ORTH: "miel.", LEMMA: "mieluummin"},
+    {ORTH: "miel."},
-    {ORTH: "milj.", LEMMA: "miljoona"},
+    {ORTH: "milj."},
-    {ORTH: "Mm.", LEMMA: "muun muassa"},
+    {ORTH: "Mm."},
-    {ORTH: "mm.", LEMMA: "muun muassa"},
+    {ORTH: "mm."},
-    {ORTH: "myöh.", LEMMA: "myöhempi"},
+    {ORTH: "myöh."},
-    {ORTH: "n.", LEMMA: "noin"},
+    {ORTH: "n."},
-    {ORTH: "nimim.", LEMMA: "nimimerkki"},
+    {ORTH: "nimim."},
-    {ORTH: "n:o", LEMMA: "numero"},
+    {ORTH: "n:o"},
-    {ORTH: "N:o", LEMMA: "numero"},
+    {ORTH: "N:o"},
-    {ORTH: "nro", LEMMA: "numero"},
+    {ORTH: "nro"},
-    {ORTH: "ns.", LEMMA: "niin sanottu"},
+    {ORTH: "ns."},
-    {ORTH: "nyk.", LEMMA: "nykyinen"},
+    {ORTH: "nyk."},
-    {ORTH: "oik.", LEMMA: "oikealla"},
+    {ORTH: "oik."},
-    {ORTH: "os.", LEMMA: "osoite"},
+    {ORTH: "os."},
-    {ORTH: "p.", LEMMA: "päivä"},
+    {ORTH: "p."},
-    {ORTH: "par.", LEMMA: "paremmin"},
+    {ORTH: "par."},
-    {ORTH: "per.", LEMMA: "perustettu"},
+    {ORTH: "per."},
-    {ORTH: "pj.", LEMMA: "puheenjohtaja"},
+    {ORTH: "pj."},
-    {ORTH: "puh.joht.", LEMMA: "puheenjohtaja"},
+    {ORTH: "puh.joht."},
-    {ORTH: "prof.", LEMMA: "professori"},
+    {ORTH: "prof."},
-    {ORTH: "puh.", LEMMA: "puhelin"},
+    {ORTH: "puh."},
-    {ORTH: "pvm.", LEMMA: "päivämäärä"},
+    {ORTH: "pvm."},
-    {ORTH: "rak.", LEMMA: "rakennettu"},
+    {ORTH: "rak."},
-    {ORTH: "ry.", LEMMA: "rekisteröity yhdistys"},
+    {ORTH: "ry."},
-    {ORTH: "s.", LEMMA: "sivu"},
+    {ORTH: "s."},
-    {ORTH: "siht.", LEMMA: "sihteeri"},
+    {ORTH: "siht."},
-    {ORTH: "synt.", LEMMA: "syntynyt"},
+    {ORTH: "synt."},
-    {ORTH: "t.", LEMMA: "toivoo"},
+    {ORTH: "t."},
-    {ORTH: "tark.", LEMMA: "tarkastanut"},
+    {ORTH: "tark."},
-    {ORTH: "til.", LEMMA: "tilattu"},
+    {ORTH: "til."},
-    {ORTH: "tms.", LEMMA: "tai muuta sellaista"},
+    {ORTH: "tms."},
-    {ORTH: "toim.", LEMMA: "toimittanut"},
+    {ORTH: "toim."},
-    {ORTH: "v.", LEMMA: "vuosi"},
+    {ORTH: "v."},
-    {ORTH: "vas.", LEMMA: "vasen"},
+    {ORTH: "vas."},
-    {ORTH: "vast.", LEMMA: "vastaus"},
+    {ORTH: "vast."},
-    {ORTH: "vrt.", LEMMA: "vertaa"},
+    {ORTH: "vrt."},
-    {ORTH: "yht.", LEMMA: "yhteensä"},
+    {ORTH: "yht."},
-    {ORTH: "yl.", LEMMA: "yleinen"},
+    {ORTH: "yl."},
-    {ORTH: "ym.", LEMMA: "ynnä muuta"},
+    {ORTH: "ym."},
-    {ORTH: "yms.", LEMMA: "ynnä muuta sellaista"},
+    {ORTH: "yms."},
-    {ORTH: "yo.", LEMMA: "ylioppilas"},
+    {ORTH: "yo."},
-    {ORTH: "yliopp.", LEMMA: "ylioppilas"},
+    {ORTH: "yliopp."},
-    {ORTH: "ao.", LEMMA: "asianomainen"},
+    {ORTH: "ao."},
-    {ORTH: "em.", LEMMA: "edellä mainittu"},
+    {ORTH: "em."},
-    {ORTH: "ko.", LEMMA: "kyseessä oleva"},
+    {ORTH: "ko."},
-    {ORTH: "ml.", LEMMA: "mukaan luettuna"},
+    {ORTH: "ml."},
-    {ORTH: "po.", LEMMA: "puheena oleva"},
+    {ORTH: "po."},
-    {ORTH: "so.", LEMMA: "se on"},
+    {ORTH: "so."},
-    {ORTH: "ts.", LEMMA: "toisin sanoen"},
+    {ORTH: "ts."},
-    {ORTH: "vm.", LEMMA: "viimeksi mainittu"},
+    {ORTH: "vm."},
-    {ORTH: "srk.", LEMMA: "seurakunta"},
+    {ORTH: "srk."},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@@ -6,56 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import FrenchLemmatizer, is_base_form
 from .syntax_iterators import SYNTAX_ITERATORS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lemmatizer import FrenchLemmatizer, is_base_form
+from ...lookups import load_lookups
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry


 DEFAULT_CONFIG = """
 [nlp]
-lang = "fr"
-stop_words = {"@language_data": "spacy.fr.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}

 [nlp.lemmatizer]
-@lemmatizers = "spacy.FrenchLemmatizer.v1"
+@lemmatizers = "spacy.fr.FrenchLemmatizer"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
 """


-@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
-def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
-    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
-
-
-@registry.language_data("spacy.fr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.fr.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+@registry.lemmatizers("spacy.fr.FrenchLemmatizer")
+def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
+    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
+
+    def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
+
+    return lemmatizer_factory


 class FrenchDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    config = Config().from_str(DEFAULT_CONFIG)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
+    lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS


 class French(Language):
     lang = "fr"
     Defaults = FrenchDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["French"]
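The French lemmatizer is now registered under the lemmatizers registry as a zero-argument function that returns a factory; the factory defers loading the lookup tables until it is handed an nlp object. A hedged sketch of how such a registered entry could be resolved by the name used in DEFAULT_CONFIG; the variable names are illustrative, not from the diff:

import spacy.lang.fr  # importing the module runs the @registry.lemmatizers registration
from spacy.util import registry

# Look up the registered creation function by the string name from the config.
create_lemmatizer = registry.lemmatizers.get("spacy.fr.FrenchLemmatizer")

# Calling it returns the inner factory, which still needs an nlp object so it
# can load the lemma tables for nlp.lang before building the FrenchLemmatizer.
lemmatizer_factory = create_lemmatizer()
# lemmatizer = lemmatizer_factory(nlp)  # where nlp would be a French pipeline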
@@ -1,26 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span


-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
@@ -1,8 +1,11 @@
 import re

+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import ELISION, HYPHENS
 from ..char_classes import ALPHA_LOWER, ALPHA
-from ...symbols import ORTH, LEMMA
+from ...symbols import ORTH
+from ...util import update_exc


 # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
 # from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
@@ -25,29 +28,29 @@ def lower_first_letter(text):
     return text[0].lower() + text[1:]


-_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]}
+_exc = {"J.-C.": [{ORTH: "J."}, {ORTH: "-C."}]}


 for exc_data in [
-    {LEMMA: "avant", ORTH: "av."},
+    {ORTH: "av."},
-    {LEMMA: "janvier", ORTH: "janv."},
+    {ORTH: "janv."},
-    {LEMMA: "février", ORTH: "févr."},
+    {ORTH: "févr."},
-    {LEMMA: "avril", ORTH: "avr."},
+    {ORTH: "avr."},
-    {LEMMA: "juillet", ORTH: "juill."},
+    {ORTH: "juill."},
-    {LEMMA: "septembre", ORTH: "sept."},
+    {ORTH: "sept."},
-    {LEMMA: "octobre", ORTH: "oct."},
+    {ORTH: "oct."},
-    {LEMMA: "novembre", ORTH: "nov."},
+    {ORTH: "nov."},
-    {LEMMA: "décembre", ORTH: "déc."},
+    {ORTH: "déc."},
-    {LEMMA: "après", ORTH: "apr."},
+    {ORTH: "apr."},
-    {LEMMA: "docteur", ORTH: "Dr."},
+    {ORTH: "Dr."},
-    {LEMMA: "monsieur", ORTH: "M."},
+    {ORTH: "M."},
-    {LEMMA: "monsieur", ORTH: "Mr."},
+    {ORTH: "Mr."},
-    {LEMMA: "madame", ORTH: "Mme."},
+    {ORTH: "Mme."},
-    {LEMMA: "mademoiselle", ORTH: "Mlle."},
+    {ORTH: "Mlle."},
-    {LEMMA: "numéro", ORTH: "n°"},
+    {ORTH: "n°"},
-    {LEMMA: "degrés", ORTH: "d°"},
+    {ORTH: "d°"},
-    {LEMMA: "saint", ORTH: "St."},
+    {ORTH: "St."},
-    {LEMMA: "sainte", ORTH: "Ste."},
+    {ORTH: "Ste."},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -77,55 +80,37 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]


-for verb, verb_lemma in [
-    ("a", "avoir"),
-    ("est", "être"),
-    ("semble", "sembler"),
-    ("indique", "indiquer"),
-    ("moque", "moquer"),
-    ("passe", "passer"),
+for verb in [
+    "a",
+    "est",
+    "semble",
+    "indique",
+    "moque",
+    "passe",
 ]:
     for orth in [verb, verb.title()]:
         for pronoun in ["elle", "il", "on"]:
             token = f"{orth}-t-{pronoun}"
-            _exc[token] = [
-                {LEMMA: verb_lemma, ORTH: orth},  # , TAG: "VERB"},
-                {LEMMA: "t", ORTH: "-t"},
-                {LEMMA: pronoun, ORTH: "-" + pronoun},
-            ]
+            _exc[token] = [{ORTH: orth}, {ORTH: "-t"}, {ORTH: "-" + pronoun}]

-for verb, verb_lemma in [("est", "être")]:
+for verb in ["est"]:
     for orth in [verb, verb.title()]:
-        token = f"{orth}-ce"
-        _exc[token] = [
-            {LEMMA: verb_lemma, ORTH: orth},  # , TAG: "VERB"},
-            {LEMMA: "ce", ORTH: "-ce"},
-        ]
+        _exc[f"{orth}-ce"] = [{ORTH: orth}, {ORTH: "-ce"}]


-for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]:
+for pre in ["qu'", "n'"]:
     for orth in [pre, pre.title()]:
-        _exc[f"{orth}est-ce"] = [
-            {LEMMA: pre_lemma, ORTH: orth},
-            {LEMMA: "être", ORTH: "est"},
-            {LEMMA: "ce", ORTH: "-ce"},
-        ]
+        _exc[f"{orth}est-ce"] = [{ORTH: orth}, {ORTH: "est"}, {ORTH: "-ce"}]


 for verb, pronoun in [("est", "il"), ("EST", "IL")]:
-    token = "{}-{}".format(verb, pronoun)
-    _exc[token] = [
-        {LEMMA: "être", ORTH: verb},
-        {LEMMA: pronoun, ORTH: "-" + pronoun},
-    ]
+    _exc[f"{verb}-{pronoun}"] = [{ORTH: verb}, {ORTH: "-" + pronoun}]


 for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
-    token = "{}'{}-{}".format(s, verb, pronoun)
-    _exc[token] = [
-        {LEMMA: "se", ORTH: s + "'"},
-        {LEMMA: "être", ORTH: verb},
-        {LEMMA: pronoun, ORTH: "-" + pronoun},
-    ]
+    _exc[f"{s}'{verb}-{pronoun}"] = [
+        {ORTH: s + "'"},
+        {ORTH: verb},
+        {ORTH: "-" + pronoun},
+    ]

@@ -452,7 +437,7 @@ _regular_exp += [
 ]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(
     "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
 ).match
@@ -1,33 +1,16 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ga"
-stop_words = {"@language_data": "spacy.ga.stop_words"}
-"""
-
-
-@registry.language_data("spacy.ga.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class IrishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS


 class Irish(Language):
     lang = "ga"
     Defaults = IrishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Irish"]
@@ -1,79 +1,65 @@
-from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX
-from ...symbols import ORTH, LEMMA, NORM
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc


 _exc = {
-    "'acha'n": [
-        {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},
-        {ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET},
-    ],
-    "dem'": [
-        {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
-        {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
-    ],
-    "ded'": [
-        {ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
-        {ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET},
-    ],
-    "lem'": [
-        {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
-        {ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
-    ],
-    "led'": [
-        {ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
-        {ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET},
-    ],
+    "'acha'n": [{ORTH: "'ach", NORM: "gach"}, {ORTH: "a'n", NORM: "aon"}],
+    "dem'": [{ORTH: "de", NORM: "de"}, {ORTH: "m'", NORM: "mo"}],
+    "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
+    "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
+    "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
 }

 for exc_data in [
-    {ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ},
+    {ORTH: "'gus", NORM: "agus"},
-    {ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET},
+    {ORTH: "'ach", NORM: "gach"},
-    {ORTH: "ao'", LEMMA: "aon", NORM: "aon"},
+    {ORTH: "ao'", NORM: "aon"},
-    {ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV},
+    {ORTH: "'niar", NORM: "aniar"},
-    {ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV},
+    {ORTH: "'níos", NORM: "aníos"},
-    {ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV},
+    {ORTH: "'ndiu", NORM: "inniu"},
-    {ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV},
+    {ORTH: "'nocht", NORM: "anocht"},
-    {ORTH: "m'", LEMMA: "mo", POS: DET},
+    {ORTH: "m'"},
-    {ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN},
+    {ORTH: "Aib."},
-    {ORTH: "Ath.", LEMMA: "athair", POS: NOUN},
+    {ORTH: "Ath."},
-    {ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN},
+    {ORTH: "Beal."},
-    {ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X},
+    {ORTH: "a.C.n."},
-    {ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV},
+    {ORTH: "m.sh."},
-    {ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN},
+    {ORTH: "M.F."},
-    {ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN},
+    {ORTH: "M.Fómh."},
-    {ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN},
+    {ORTH: "D.F."},
-    {ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN},
+    {ORTH: "D.Fómh."},
-    {ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "r.C."},
-    {ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "R.C."},
-    {ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "r.Ch."},
-    {ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "r.Chr."},
-    {ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "R.Ch."},
-    {ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV},
+    {ORTH: "R.Chr."},
-    {ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV},
+    {ORTH: "⁊rl."},
-    {ORTH: "srl.", LEMMA: "agus araile", POS: ADV},
+    {ORTH: "srl."},
-    {ORTH: "Co.", LEMMA: "contae", POS: NOUN},
+    {ORTH: "Co."},
-    {ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN},
+    {ORTH: "Ean."},
-    {ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN},
+    {ORTH: "Feab."},
-    {ORTH: "gCo.", LEMMA: "contae", POS: NOUN},
+    {ORTH: "gCo."},
-    {ORTH: ".i.", LEMMA: "eadhon", POS: ADV},
+    {ORTH: ".i."},
-    {ORTH: "B'", LEMMA: "ba", POS: AUX},
+    {ORTH: "B'"},
-    {ORTH: "b'", LEMMA: "ba", POS: AUX},
+    {ORTH: "b'"},
-    {ORTH: "lch.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "lch."},
-    {ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "Lch."},
-    {ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "lgh."},
-    {ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN},
+    {ORTH: "Lgh."},
-    {ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN},
+    {ORTH: "Lún."},
-    {ORTH: "Már.", LEMMA: "Márta", POS: NOUN},
+    {ORTH: "Már."},
-    {ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN},
+    {ORTH: "Meith."},
-    {ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN},
+    {ORTH: "Noll."},
-    {ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN},
+    {ORTH: "Samh."},
-    {ORTH: "tAth.", LEMMA: "athair", POS: NOUN},
+    {ORTH: "tAth."},
-    {ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN},
+    {ORTH: "tUas."},
-    {ORTH: "teo.", LEMMA: "teoranta", POS: NOUN},
+    {ORTH: "teo."},
-    {ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN},
+    {ORTH: "Teo."},
-    {ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN},
+    {ORTH: "Uas."},
-    {ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN},
+    {ORTH: "uimh."},
-    {ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN},
+    {ORTH: "Uimh."},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -81,4 +67,4 @@ for orth in ["d'", "D'"]:
     _exc[orth] = [{ORTH: orth}]


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "gu"
-stop_words = {"@language_data": "spacy.gu.stop_words"}
-"""
-
-
-@registry.language_data("spacy.gu.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class GujaratiDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Gujarati(Language):
     lang = "gu"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = GujaratiDefaults


 __all__ = ["Gujarati"]
@@ -1,37 +1,15 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "he"
-stop_words = {"@language_data": "spacy.he.stop_words"}
-
-[nlp.writing_system]
-direction = "rtl"
-has_case = false
-has_letters = true
-"""
-
-
-@registry.language_data("spacy.he.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class HebrewDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


 class Hebrew(Language):
     lang = "he"
     Defaults = HebrewDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Hebrew"]
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hi"
-stop_words = {"@language_data": "spacy.hi.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.hi.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.hi.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class HindiDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS


 class Hindi(Language):
     lang = "hi"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = HindiDefaults


 __all__ = ["Hindi"]
@@ -1,39 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hr"
-stop_words = {"@language_data": "spacy.hr.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.hr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class CroatianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = STOP_WORDS


 class Croatian(Language):
     lang = "hr"
     Defaults = CroatianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Croatian"]
@@ -1,45 +1,21 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hu"
-stop_words = {"@language_data": "spacy.hu.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.hu.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class HungarianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     token_match = TOKEN_MATCH
+    stop_words = STOP_WORDS


 class Hungarian(Language):
     lang = "hu"
     Defaults = HungarianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Hungarian"]
@@ -1,7 +1,9 @@
 import re

+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..punctuation import ALPHA_LOWER, CURRENCY
 from ...symbols import ORTH
+from ...util import update_exc


 _exc = {}
@@ -644,5 +646,5 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 )


-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "hy"
-stop_words = {"@language_data": "spacy.hy.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.hy.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.hy.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class ArmenianDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Armenian(Language):
     lang = "hy"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = ArmenianDefaults


 __all__ = ["Armenian"]
@@ -1,53 +1,24 @@
-from typing import Set, Dict, Callable, Any
-from thinc.config import Config
-
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "id"
-stop_words = {"@language_data": "spacy.id.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.id.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.id.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class IndonesianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Indonesian(Language):
     lang = "id"
     Defaults = IndonesianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Indonesian"]

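The hunks above replace the per-language DEFAULT_CONFIG strings and @registry.language_data functions with plain class attributes on a Language.Defaults subclass. A minimal usage sketch, not part of the diff; the stop word checked is only an assumed example:

import spacy

nlp = spacy.blank("id")                    # constructs Indonesian() and picks up IndonesianDefaults
print(nlp.Defaults.__name__)               # -> "IndonesianDefaults"
print("yang" in nlp.Defaults.stop_words)   # assumed Indonesian stop word, for illustration only
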
@@ -1,26 +1,20 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span


-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
     """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")

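For context, a noun_chunks iterator like the one above is what backs Doc.noun_chunks once the language's syntax_iterators are wired in. A rough usage sketch, assuming an installed pipeline with a parser; the model name is illustrative and not part of this commit:

import spacy

nlp = spacy.load("en_core_web_sm")   # any parsed pipeline; the name here is an assumption
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for chunk in doc.noun_chunks:        # delegates to the language's registered noun_chunks iterator
    print(chunk.text, "<-", chunk.root.dep_)
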
@@ -1,5 +1,8 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
-from ...symbols import ORTH, LEMMA, NORM
+from ...symbols import ORTH, NORM
+from ...util import update_exc


 # Daftar singkatan dan Akronim dari:
 # https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
@@ -8,53 +11,47 @@ _exc = {}

 for orth in ID_BASE_EXCEPTIONS:
     _exc[orth] = [{ORTH: orth}]
-
     orth_title = orth.title()
     _exc[orth_title] = [{ORTH: orth_title}]
-
     orth_caps = orth.upper()
     _exc[orth_caps] = [{ORTH: orth_caps}]
-
     orth_lower = orth.lower()
     _exc[orth_lower] = [{ORTH: orth_lower}]
-
     orth_first_upper = orth[0].upper() + orth[1:]
     _exc[orth_first_upper] = [{ORTH: orth_first_upper}]
-
     if "-" in orth:
         orth_title = "-".join([part.title() for part in orth.split("-")])
         _exc[orth_title] = [{ORTH: orth_title}]
-
         orth_caps = "-".join([part.upper() for part in orth.split("-")])
         _exc[orth_caps] = [{ORTH: orth_caps}]

 for exc_data in [
-    {ORTH: "Jan.", LEMMA: "Januari", NORM: "Januari"},
-    {ORTH: "Feb.", LEMMA: "Februari", NORM: "Februari"},
-    {ORTH: "Mar.", LEMMA: "Maret", NORM: "Maret"},
-    {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
-    {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
-    {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
-    {ORTH: "Agu.", LEMMA: "Agustus", NORM: "Agustus"},
-    {ORTH: "Ags.", LEMMA: "Agustus", NORM: "Agustus"},
-    {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
-    {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
-    {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
-    {ORTH: "Des.", LEMMA: "Desember", NORM: "Desember"},
+    {ORTH: "Jan.", NORM: "Januari"},
+    {ORTH: "Feb.", NORM: "Februari"},
+    {ORTH: "Mar.", NORM: "Maret"},
+    {ORTH: "Apr.", NORM: "April"},
+    {ORTH: "Jun.", NORM: "Juni"},
+    {ORTH: "Jul.", NORM: "Juli"},
+    {ORTH: "Agu.", NORM: "Agustus"},
+    {ORTH: "Ags.", NORM: "Agustus"},
+    {ORTH: "Sep.", NORM: "September"},
+    {ORTH: "Okt.", NORM: "Oktober"},
+    {ORTH: "Nov.", NORM: "November"},
+    {ORTH: "Des.", NORM: "Desember"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

 _other_exc = {
-    "do'a": [{ORTH: "do'a", LEMMA: "doa", NORM: "doa"}],
-    "jum'at": [{ORTH: "jum'at", LEMMA: "Jumat", NORM: "Jumat"}],
-    "Jum'at": [{ORTH: "Jum'at", LEMMA: "Jumat", NORM: "Jumat"}],
-    "la'nat": [{ORTH: "la'nat", LEMMA: "laknat", NORM: "laknat"}],
-    "ma'af": [{ORTH: "ma'af", LEMMA: "maaf", NORM: "maaf"}],
-    "mu'jizat": [{ORTH: "mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}],
-    "Mu'jizat": [{ORTH: "Mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}],
-    "ni'mat": [{ORTH: "ni'mat", LEMMA: "nikmat", NORM: "nikmat"}],
-    "raka'at": [{ORTH: "raka'at", LEMMA: "rakaat", NORM: "rakaat"}],
-    "ta'at": [{ORTH: "ta'at", LEMMA: "taat", NORM: "taat"}],
+    "do'a": [{ORTH: "do'a", NORM: "doa"}],
+    "jum'at": [{ORTH: "jum'at", NORM: "Jumat"}],
+    "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}],
+    "la'nat": [{ORTH: "la'nat", NORM: "laknat"}],
+    "ma'af": [{ORTH: "ma'af", NORM: "maaf"}],
+    "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}],
+    "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}],
+    "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}],
+    "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}],
+    "ta'at": [{ORTH: "ta'at", NORM: "taat"}],
 }

 _exc.update(_other_exc)
@@ -221,4 +218,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

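The recurring `TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)` change moves the merge with the shared base exceptions into each language module instead of doing it in the Defaults class. A small sketch of what that helper does with the same data shape; the entries below are made up for illustration:

from spacy.symbols import ORTH, NORM
from spacy.util import update_exc

base = {"a.m.": [{ORTH: "a.m."}]}                    # stand-in for BASE_EXCEPTIONS
local = {"Jan.": [{ORTH: "Jan.", NORM: "Januari"}]}  # language-specific additions, same shape as above
merged = update_exc(base, local)                     # copy of base, updated with the local entries
assert set(merged) == {"a.m.", "Jan."}
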
@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "is"
-stop_words = {"@language_data": "spacy.is.stop_words"}
-"""
-
-
-@registry.language_data("spacy.is.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class IcelandicDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Icelandic(Language):
     lang = "is"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = IcelandicDefaults


 __all__ = ["Icelandic"]

@@ -1,35 +1,11 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "it"
-stop_words = {"@language_data": "spacy.it.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.it.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class ItalianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
@@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults):
 class Italian(Language):
     lang = "it"
     Defaults = ItalianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Italian"]

@@ -1,4 +1,7 @@
-from ...symbols import ORTH, LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH
+from ...util import update_exc


 _exc = {
     "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
@@ -7,7 +10,7 @@ _exc = {
     "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
     "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
     "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
-    "po'": [{ORTH: "po'", LEMMA: "poco"}],
+    "po'": [{ORTH: "po'"}],
     "sett..": [{ORTH: "sett."}, {ORTH: "."}],
 }

@@ -52,4 +55,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,4 +1,4 @@
-from typing import Optional, Union, Dict, Any, Set
+from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
@@ -20,27 +20,15 @@ from ... import util

 DEFAULT_CONFIG = """
 [nlp]
-lang = "ja"
-stop_words = {"@language_data": "spacy.ja.stop_words"}

 [nlp.tokenizer]
-@tokenizers = "spacy.JapaneseTokenizer.v1"
+@tokenizers = "spacy.ja.JapaneseTokenizer"
 split_mode = null
-
-[nlp.writing_system]
-direction = "ltr"
-has_case = false
-has_letters = false
 """


-@registry.language_data("spacy.ja.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.tokenizers("spacy.JapaneseTokenizer.v1")
-def create_japanese_tokenizer(split_mode: Optional[str] = None):
+@registry.tokenizers("spacy.ja.JapaneseTokenizer")
+def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
         return JapaneseTokenizer(nlp, split_mode=split_mode)

@@ -50,6 +38,8 @@ def create_japanese_tokenizer(split_mode: Optional[str] = None):
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
         self.vocab = nlp.vocab
+        # TODO: is this the right way to do it?
+        self.vocab.morphology.load_tag_map(TAG_MAP)
         self.split_mode = split_mode
         self.tokenizer = try_sudachi_import(self.split_mode)

@@ -172,14 +162,15 @@ class JapaneseTokenizer(DummyTokenizer):


 class JapaneseDefaults(Language.Defaults):
-    tag_map = TAG_MAP
+    config = Config().from_str(DEFAULT_CONFIG)
+    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}


 class Japanese(Language):
     lang = "ja"
     Defaults = JapaneseDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 # Hold the attributes we need with convenient names

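The Japanese (and, further down, Korean) hunks rename the registered tokenizer factories to language-scoped names and select them from the [nlp.tokenizer] block of the embedded config. A hedged sketch of the same pattern for a custom tokenizer; the registry name and class below are hypothetical and not part of spaCy or of this commit:

from spacy.tokens import Doc
from spacy.util import registry

@registry.tokenizers("my.WhitespaceTokenizer")         # hypothetical registry entry
def create_whitespace_tokenizer():
    def whitespace_tokenizer_factory(nlp):             # mirrors japanese_tokenizer_factory above
        class WhitespaceTokenizer:
            def __init__(self, vocab):
                self.vocab = vocab
            def __call__(self, text):
                # naive split on spaces, just to show the factory shape
                return Doc(self.vocab, words=text.split(" "))
        return WhitespaceTokenizer(nlp.vocab)
    return whitespace_tokenizer_factory

A config would then pick it up with:

[nlp.tokenizer]
@tokenizers = "my.WhitespaceTokenizer"
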
@@ -1,33 +1,23 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON, VERB
+from ...tokens import Doc, Span

-# XXX this can probably be pruned a bit
-labels = [
-    "nsubj",
-    "nmod",
-    "dobj",
-    "nsubjpass",
-    "pcomp",
-    "pobj",
-    "obj",
-    "obl",
-    "dative",
-    "appos",
-    "attr",
-    "ROOT",
-]
-
-
-def noun_chunks(obj):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    doc = obj.doc  # Ensure works on both Doc and Span.
+# TODO: this can probably be pruned a bit
+# fmt: off
+labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclike", "obl", "dative", "appos", "attr", "ROOT"]
+# fmt: on
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    doc = doclike.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
     seen = set()
-    for i, word in enumerate(obj):
+    for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
@@ -37,12 +27,10 @@ def noun_chunks(obj):
             unseen = [w.i for w in word.subtree if w.i not in seen]
             if not unseen:
                 continue
-
             # this takes care of particles etc.
             seen.update(j.i for j in word.subtree)
             # This avoids duplicating embedded clauses
             seen.update(range(word.i + 1))
-
             # if the head of this is a verb, mark that and rights seen
             # Don't do the subtree as that can hide other phrases
             if word.head.pos == VERB:

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry


-DEFAULT_CONFIG = """
-[nlp]
-lang = "kn"
-stop_words = {"@language_data": "spacy.kn.stop_words"}
-"""
-
-
-@registry.language_data("spacy.kn.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class KannadaDefaults(Language.Defaults):
+    stop_words = STOP_WORDS


 class Kannada(Language):
     lang = "kn"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = KannadaDefaults


 __all__ = ["Kannada"]

@@ -1,8 +1,9 @@
-from typing import Set, Optional, Any, Dict
+from typing import Optional, Any, Dict
 from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
@@ -11,26 +12,14 @@ from ...util import DummyTokenizer, registry

 DEFAULT_CONFIG = """
 [nlp]
-lang = "ko"
-stop_words = {"@language_data": "spacy.ko.stop_words"}

 [nlp.tokenizer]
-@tokenizers = "spacy.KoreanTokenizer.v1"
-
-[nlp.writing_system]
-direction = "ltr"
-has_case = false
-has_letters = false
+@tokenizers = "spacy.ko.KoreanTokenizer"
 """


-@registry.language_data("spacy.ko.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.tokenizers("spacy.KoreanTokenizer.v1")
-def create_korean_tokenizer():
+@registry.tokenizers("spacy.ko.KoreanTokenizer")
+def create_tokenizer():
     def korean_tokenizer_factory(nlp):
         return KoreanTokenizer(nlp)

@@ -40,6 +29,8 @@ def create_korean_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, nlp: Optional[Language] = None):
         self.vocab = nlp.vocab
+        # TODO: is this the right way to do it?
+        self.vocab.morphology.load_tag_map(TAG_MAP)
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

@@ -73,13 +64,15 @@ class KoreanTokenizer(DummyTokenizer):


 class KoreanDefaults(Language.Defaults):
-    tag_map = TAG_MAP
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}


 class Korean(Language):
     lang = "ko"
     Defaults = KoreanDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 def try_mecab_import() -> None:

@@ -1,49 +1,20 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "lb"
-stop_words = {"@language_data": "spacy.lb.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.lb.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.lb.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class LuxembourgishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS


 class Luxembourgish(Language):
     lang = "lb"
     Defaults = LuxembourgishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Luxembourgish"]

@@ -1,4 +1,7 @@
-from ...symbols import ORTH, LEMMA, NORM
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc


 # TODO
 # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
@@ -7,19 +10,19 @@ _exc = {}

 # translate / delete what is not necessary
 for exc_data in [
-    {ORTH: "’t", LEMMA: "et", NORM: "et"},
-    {ORTH: "’T", LEMMA: "et", NORM: "et"},
-    {ORTH: "'t", LEMMA: "et", NORM: "et"},
-    {ORTH: "'T", LEMMA: "et", NORM: "et"},
-    {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},
-    {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
-    {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
-    {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
-    {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
-    {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
-    {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
-    {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
-    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
+    {ORTH: "’t", NORM: "et"},
+    {ORTH: "’T", NORM: "et"},
+    {ORTH: "'t", NORM: "et"},
+    {ORTH: "'T", NORM: "et"},
+    {ORTH: "wgl.", NORM: "wannechgelift"},
+    {ORTH: "M.", NORM: "Monsieur"},
+    {ORTH: "Mme.", NORM: "Madame"},
+    {ORTH: "Dr.", NORM: "Dokter"},
+    {ORTH: "Tel.", NORM: "Telefon"},
+    {ORTH: "asw.", NORM: "an sou weider"},
+    {ORTH: "etc.", NORM: "et cetera"},
+    {ORTH: "bzw.", NORM: "bezéiungsweis"},
+    {ORTH: "Jan.", NORM: "Januar"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -47,4 +50,4 @@ for orth in [
 ]:
     _exc[orth] = [{ORTH: orth}]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,35 +1,18 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "lij"
-stop_words = {"@language_data": "spacy.lij.stop_words"}
-"""
-
-
-@registry.language_data("spacy.lij.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS


 class LigurianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS


 class Ligurian(Language):
     lang = "lij"
     Defaults = LigurianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Ligurian"]

@@ -1,50 +1,50 @@
-from ...symbols import ORTH, LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH
+from ...util import update_exc


 _exc = {}

-for raw, lemma in [
-    ("a-a", "a-o"),
-    ("a-e", "a-o"),
-    ("a-o", "a-o"),
-    ("a-i", "a-o"),
-    ("co-a", "co-o"),
-    ("co-e", "co-o"),
-    ("co-i", "co-o"),
-    ("co-o", "co-o"),
-    ("da-a", "da-o"),
-    ("da-e", "da-o"),
-    ("da-i", "da-o"),
-    ("da-o", "da-o"),
-    ("pe-a", "pe-o"),
-    ("pe-e", "pe-o"),
-    ("pe-i", "pe-o"),
-    ("pe-o", "pe-o"),
+for raw in [
+    "a-e",
+    "a-o",
+    "a-i",
+    "a-a",
+    "co-a",
+    "co-e",
+    "co-i",
+    "co-o",
+    "da-a",
+    "da-e",
+    "da-i",
+    "da-o",
+    "pe-a",
+    "pe-e",
+    "pe-i",
+    "pe-o",
 ]:
     for orth in [raw, raw.capitalize()]:
-        _exc[orth] = [{ORTH: orth, LEMMA: lemma}]
+        _exc[orth] = [{ORTH: orth}]

 # Prefix + prepositions with à (e.g. "sott'a-o")

-for prep, prep_lemma in [
-    ("a-a", "a-o"),
-    ("a-e", "a-o"),
-    ("a-o", "a-o"),
-    ("a-i", "a-o"),
+for prep in [
+    "a-a",
+    "a-e",
+    "a-o",
+    "a-i",
 ]:
-    for prefix, prefix_lemma in [
-        ("sott'", "sotta"),
-        ("sott’", "sotta"),
-        ("contr'", "contra"),
-        ("contr’", "contra"),
-        ("ch'", "che"),
-        ("ch’", "che"),
-        ("s'", "se"),
-        ("s’", "se"),
+    for prefix in [
+        "sott'",
+        "sott’",
+        "contr'",
+        "contr’",
+        "ch'",
+        "ch’",
+        "s'",
+        "s’",
     ]:
         for prefix_orth in [prefix, prefix.capitalize()]:
-            _exc[prefix_orth + prep] = [
-                {ORTH: prefix_orth, LEMMA: prefix_lemma},
-                {ORTH: prep, LEMMA: prep_lemma},
-            ]
+            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]

-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,54 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "lt"
-stop_words = {"@language_data": "spacy.lt.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.lt.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.lt.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS


 class LithuanianDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    mod_base_exceptions = {
-        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
-    }
-    del mod_base_exceptions["8)"]
-    tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS


 class Lithuanian(Language):
     lang = "lt"
     Defaults = LithuanianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Lithuanian"]

@@ -1,267 +1,15 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc


 _exc = {}

-for orth in [
-    "n-tosios",
-    "?!",
-    #    "G.",
-    #    "J. E.",
-    #    "J. Em.",
-    #    "J.E.",
-    #    "J.Em.",
-    #    "K.",
-    #    "N.",
-    #    "V.",
-    #    "Vt.",
-    #    "a.",
-    #    "a.k.",
-    #    "a.s.",
-    #    "adv.",
-    #    "akad.",
-    #    "aklg.",
-    #    "akt.",
-    #    "al.",
-    #    "ang.",
-    #    "angl.",
-    #    "aps.",
-    #    "apskr.",
-    #    "apyg.",
-    #    "arbat.",
-    #    "asist.",
-    #    "asm.",
-    #    "asm.k.",
-    #    "asmv.",
-    #    "atk.",
-    #    "atsak.",
-    #    "atsisk.",
-    #    "atsisk.sąsk.",
-    #    "atv.",
-    #    "aut.",
-    #    "avd.",
-    #    "b.k.",
-    #    "baud.",
-    #    "biol.",
-    #    "bkl.",
-    #    "bot.",
-    #    "bt.",
-    #    "buv.",
-    #    "ch.",
-    #    "chem.",
-    #    "corp.",
-    #    "d.",
-    #    "dab.",
-    #    "dail.",
-    #    "dek.",
-    #    "deš.",
-    #    "dir.",
-    #    "dirig.",
-    #    "doc.",
-    #    "dol.",
-    #    "dr.",
-    #    "drp.",
-    #    "dvit.",
-    #    "dėst.",
-    #    "dš.",
-    #    "dž.",
-    #    "e.b.",
-    #    "e.bankas",
-    #    "e.p.",
-    #    "e.parašas",
-    #    "e.paštas",
-    #    "e.v.",
-    #    "e.valdžia",
-    #    "egz.",
-    #    "eil.",
-    #    "ekon.",
-    #    "el.",
-    #    "el.bankas",
-    #    "el.p.",
-    #    "el.parašas",
-    #    "el.paštas",
-    #    "el.valdžia",
-    #    "etc.",
-    #    "ež.",
-    #    "fak.",
-    #    "faks.",
-    #    "feat.",
-    #    "filol.",
-    #    "filos.",
-    #    "g.",
-    #    "gen.",
-    #    "geol.",
-    #    "gerb.",
-    #    "gim.",
-    #    "gr.",
-    #    "gv.",
-    #    "gyd.",
-    #    "gyv.",
-    #    "habil.",
-    #    "inc.",
-    #    "insp.",
-    #    "inž.",
-    #    "ir pan.",
-    #    "ir t. t.",
-    #    "isp.",
-    #    "istor.",
-    #    "it.",
-    #    "just.",
-    #    "k.",
-    #    "k. a.",
-    #    "k.a.",
-    #    "kab.",
-    #    "kand.",
-    #    "kart.",
-    #    "kat.",
-    #    "ketv.",
-    #    "kh.",
-    #    "kl.",
-    #    "kln.",
-    #    "km.",
-    #    "kn.",
-    #    "koresp.",
-    #    "kpt.",
-    #    "kr.",
-    #    "kt.",
-    #    "kub.",
-    #    "kun.",
-    #    "kv.",
-    #    "kyš.",
-    #    "l. e. p.",
-    #    "l.e.p.",
-    #    "lenk.",
-    #    "liet.",
-    #    "lot.",
-    #    "lt.",
-    #    "ltd.",
-    #    "ltn.",
-    #    "m.",
-    #    "m.e..",
-    #    "m.m.",
-    #    "mat.",
-    #    "med.",
-    #    "mgnt.",
-    #    "mgr.",
-    #    "min.",
-    #    "mjr.",
-    #    "ml.",
-    #    "mln.",
-    #    "mlrd.",
-    #    "mob.",
-    #    "mok.",
-    #    "moksl.",
-    #    "mokyt.",
-    #    "mot.",
-    #    "mr.",
-    #    "mst.",
-    #    "mstl.",
-    #    "mėn.",
-    #    "nkt.",
-    #    "no.",
-    #    "nr.",
-    #    "ntk.",
-    #    "nuotr.",
-    #    "op.",
-    #    "org.",
-    #    "orig.",
-    #    "p.",
-    #    "p.d.",
-    #    "p.m.e.",
-    #    "p.s.",
-    #    "pab.",
-    #    "pan.",
-    #    "past.",
-    #    "pav.",
-    #    "pavad.",
-    #    "per.",
-    #    "perd.",
-    #    "pirm.",
-    #    "pl.",
-    #    "plg.",
-    #    "plk.",
-    #    "pr.",
-    #    "pr.Kr.",
-    #    "pranc.",
-    #    "proc.",
-    #    "prof.",
-    #    "prom.",
-    #    "prot.",
-    #    "psl.",
-    #    "pss.",
-    #    "pvz.",
-    #    "pšt.",
-    #    "r.",
-    #    "raj.",
-    #    "red.",
-    #    "rez.",
-    #    "rež.",
-    #    "rus.",
-    #    "rš.",
-    #    "s.",
-    #    "sav.",
-    #    "saviv.",
-    #    "sek.",
-    #    "sekr.",
-    #    "sen.",
-    #    "sh.",
-    #    "sk.",
-    #    "skg.",
-    #    "skv.",
-    #    "skyr.",
-    #    "sp.",
-    #    "spec.",
-    #    "sr.",
-    #    "st.",
-    #    "str.",
-    #    "stud.",
-    #    "sąs.",
-    #    "t.",
-    #    "t. p.",
-    #    "t. y.",
-    #    "t.p.",
-    #    "t.t.",
-    #    "t.y.",
-    #    "techn.",
-    #    "tel.",
-    #    "teol.",
-    #    "th.",
-    #    "tir.",
-    #    "trit.",
-    #    "trln.",
-    #    "tšk.",
-    #    "tūks.",
-    #    "tūkst.",
-    #    "up.",
-    #    "upl.",
-    #    "v.s.",
-    #    "vad.",
-    #    "val.",
-    #    "valg.",
-    #    "ved.",
-    #    "vert.",
-    #    "vet.",
-    #    "vid.",
-    #    "virš.",
-    #    "vlsč.",
-    #    "vnt.",
-    #    "vok.",
-    #    "vs.",
-    #    "vtv.",
-    #    "vv.",
-    #    "vyr.",
-    #    "vyresn.",
-    #    "zool.",
-    #    "Įn",
-    #    "įl.",
-    #    "š.m.",
-    #    "šnek.",
-    #    "šv.",
-    #    "švč.",
-    #    "ž.ū.",
-    #    "žin.",
-    #    "žml.",
-    #    "žr.",
-]:
+for orth in ["n-tosios", "?!"]:
     _exc[orth] = [{ORTH: orth}]

-TOKENIZER_EXCEPTIONS = _exc
+mod_base_exceptions = {
+    exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+}
+del mod_base_exceptions["8)"]
+TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc)

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "lv"
-stop_words = {"@language_data": "spacy.lv.stop_words"}
-"""
-
-
-@registry.language_data("spacy.lv.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class LatvianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Latvian(Language):
     lang = "lv"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = LatvianDefaults
 
 
 __all__ = ["Latvian"]

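Note: the Latvian hunk above shows the pattern this PR applies to the simpler language modules — the inline DEFAULT_CONFIG string and the @registry.language_data helpers are replaced by a Language.Defaults subclass that carries the data as class attributes. A minimal usage sketch follows; it is not part of the diff, and spacy.blank plus the attribute access are assumed from spaCy's public API:

# Sketch only: a language defined via a Defaults subclass is picked up automatically.
import spacy

nlp = spacy.blank("lv")                 # builds Latvian(), which points at LatvianDefaults
print(nlp.lang)                         # "lv"
print(len(nlp.Defaults.stop_words))     # STOP_WORDS now lives on the class, not in the config
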
@@ -1,26 +1,16 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ml"
-stop_words = {"@language_data": "spacy.ml.stop_words"}
-"""
-
-
-@registry.language_data("spacy.ml.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class MalayalamDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Malayalam(Language):
     lang = "ml"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = MalayalamDefaults
 
 
 __all__ = ["Malayalam"]

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "af"
-stop_words = {"@language_data": "spacy.mr.stop_words"}
-"""
-
-
-@registry.language_data("spacy.mr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class MarathiDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Marathi(Language):
     lang = "mr"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = MarathiDefaults
 
 
 __all__ = ["Marathi"]

@@ -1,47 +1,23 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "nb"
-stop_words = {"@language_data": "spacy.nb.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.nb.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
 
 
 class NorwegianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
 
 
 class Norwegian(Language):
     lang = "nb"
     Defaults = NorwegianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Norwegian"]

@@ -1,26 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span
 
 
-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")

@@ -1,21 +1,23 @@
-from ...symbols import ORTH, LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
 
 
 _exc = {}
 
 
 for exc_data in [
-    {ORTH: "jan.", LEMMA: "januar"},
-    {ORTH: "feb.", LEMMA: "februar"},
-    {ORTH: "mar.", LEMMA: "mars"},
-    {ORTH: "apr.", LEMMA: "april"},
-    {ORTH: "jun.", LEMMA: "juni"},
-    {ORTH: "jul.", LEMMA: "juli"},
-    {ORTH: "aug.", LEMMA: "august"},
-    {ORTH: "sep.", LEMMA: "september"},
-    {ORTH: "okt.", LEMMA: "oktober"},
-    {ORTH: "nov.", LEMMA: "november"},
-    {ORTH: "des.", LEMMA: "desember"},
+    {ORTH: "jan.", NORM: "januar"},
+    {ORTH: "feb.", NORM: "februar"},
+    {ORTH: "mar.", NORM: "mars"},
+    {ORTH: "apr.", NORM: "april"},
+    {ORTH: "jun.", NORM: "juni"},
+    {ORTH: "jul.", NORM: "juli"},
+    {ORTH: "aug.", NORM: "august"},
+    {ORTH: "sep.", NORM: "september"},
+    {ORTH: "okt.", NORM: "oktober"},
+    {ORTH: "nov.", NORM: "november"},
+    {ORTH: "des.", NORM: "desember"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -218,4 +220,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

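Note: two recurring changes are visible in the tokenizer-exception hunks — LEMMA is dropped from the exception dicts (only ORTH and NORM remain), and the merge with the shared BASE_EXCEPTIONS now happens in the language module via update_exc. A rough sketch of that merge, not taken from the diff; the sample entry is made up and the import paths are assumed from spaCy's layout:

# Sketch of the module-level merge performed by the last line of the hunk above.
from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
from spacy.symbols import ORTH, NORM
from spacy.util import update_exc

_exc = {"jan.": [{ORTH: "jan.", NORM: "januar"}]}
merged = update_exc(BASE_EXCEPTIONS, _exc)   # later dicts win on conflicting keys
print(len(BASE_EXCEPTIONS), len(merged))
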
@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ne"
-stop_words = {"@language_data": "spacy.ne.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.ne.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ne.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class NepaliDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
 
 
 class Nepali(Language):
     lang = "ne"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = NepaliDefaults
 
 
 __all__ = ["Nepali"]

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..norm_exceptions import BASE_NORMS
 from ...attrs import NORM, LIKE_NUM
 

@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config
 
 from .stop_words import STOP_WORDS
@@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...lookups import load_lookups
 from ...language import Language
-from ...util import update_exc, registry
+from ...util import registry
 
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "nl"
-stop_words = {"@language_data": "spacy.nl.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
 
 [nlp.lemmatizer]
-@lemmatizers = "spacy.DutchLemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
+@lemmatizers = "spacy.nl.DutchLemmatizer"
 """
 
 
-@registry.language_data("spacy.nl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+@registry.lemmatizers("spacy.nl.DutchLemmatizer")
+def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
+    tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 
+    def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return DutchLemmatizer(lookups=lookups)
 
-@registry.language_data("spacy.nl.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.DutchLemmatizer.v1")
-def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
-    return DutchLemmatizer(data_paths=data_paths)
+    return lemmatizer_factory
 
 
 class DutchDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    config = Config().from_str(DEFAULT_CONFIG)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Dutch(Language):
     lang = "nl"
     Defaults = DutchDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Dutch"]

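Note: in the Dutch hunk the lemmatizer config no longer passes data_paths; it points at a registered factory ("spacy.nl.DutchLemmatizer") that returns an inner factory taking the nlp object and loading lookup tables with load_lookups. A rough by-hand sketch, not part of the diff — registry.lemmatizers.get is the catalogue lookup spaCy's registry exposes, and the spacy-lookups-data package is assumed to be installed for the "nl" tables:

# Sketch: resolving the registered lemmatizer factory manually.
import spacy
from spacy.util import registry

make_factory = registry.lemmatizers.get("spacy.nl.DutchLemmatizer")
nlp = spacy.blank("nl")
lemmatizer = make_factory()(nlp)    # outer call builds the factory, inner call loads lookups for nlp.lang
print(type(lemmatizer).__name__)    # DutchLemmatizer
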
@@ -1,4 +1,7 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc
+
 
 # Extensive list of both common and uncommon dutch abbreviations copied from
 # github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
@@ -1602,4 +1605,4 @@ for orth in abbrevs:
         _exc[i] = [{ORTH: i}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,4 +1,4 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config
 
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@@ -7,54 +7,53 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...lookups import load_lookups
 from ...language import Language
 from ...util import registry
 
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "pl"
-stop_words = {"@language_data": "spacy.pl.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
 
 [nlp.lemmatizer]
-@lemmatizers = "spacy.PolishLemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
+@lemmatizers = "spacy.pl.PolishLemmatizer"
 """
 
-
-@registry.language_data("spacy.pl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+TOKENIZER_EXCEPTIONS = {
+    exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
+}
 
 
-@registry.language_data("spacy.pl.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+@registry.lemmatizers("spacy.pl.PolishLemmatizer")
+def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
+    # fmt: off
+    tables = [
+        "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
+        "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
+        "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
+    ]
+    # fmt: on
 
+    def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
+        lookups = load_lookups(lang=nlp.lang, tables=tables)
+        return PolishLemmatizer(lookups=lookups)
 
-@registry.lemmatizers("spacy.PolishLemmatizer.v1")
-def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
-    return PolishLemmatizer(data_paths=data_paths)
+    return lemmatizer_factory
 
 
 class PolishDefaults(Language.Defaults):
-    mod_base_exceptions = {
-        exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
-    }
-    tokenizer_exceptions = mod_base_exceptions
+    config = Config().from_str(DEFAULT_CONFIG)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Polish(Language):
     lang = "pl"
     Defaults = PolishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Polish"]

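Note: for Polish the base-exception filter moves to module level. The dict comprehension keeps only base exceptions that do not end in a period, so period-final abbreviations are handled by the Polish rules instead. A tiny illustration with stand-in data (not the real BASE_EXCEPTIONS):

# Illustration only: period-final keys are dropped by the comprehension above.
base = {"a.m.": [{"ORTH": "a.m."}], ":)": [{"ORTH": ":)"}], "p.m.": [{"ORTH": "p.m."}]}
filtered = {exc: val for exc, val in base.items() if not exc.endswith(".")}
print(sorted(filtered))   # [':)']
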
@@ -1,50 +1,21 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "pt"
-stop_words = {"@language_data": "spacy.pt.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.pt.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.pt.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class PortugueseDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Portuguese(Language):
     lang = "pt"
     Defaults = PortugueseDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Portuguese"]

@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc
 
 
 _exc = {}
@@ -50,4 +52,4 @@ for orth in [
     _exc[orth] = [{ORTH: orth}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 
 
-_prefixes = (
+TOKENIZER_PREFIXES = (
     ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
     + LIST_PUNCT
     + LIST_ELLIPSES
@@ -13,7 +13,7 @@ _prefixes = (
 )
 
 
-_suffixes = (
+TOKENIZER_SUFFIXES = (
     LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
@@ -31,7 +31,7 @@ _suffixes = (
     ]
 )
 
-_infixes = (
+TOKENIZER_INFIXES = (
     LIST_ELLIPSES
     + LIST_ICONS
     + [
@@ -44,7 +44,3 @@ _infixes = (
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_SUFFIXES = _suffixes
-TOKENIZER_INFIXES = _infixes

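Note: the shared punctuation module now defines TOKENIZER_PREFIXES/SUFFIXES/INFIXES directly instead of private _prefixes/_suffixes/_infixes plus re-export lines. A sketch of how these exported lists are typically consumed; the compile_*_regex helpers are assumed from spaCy's public utils and are not part of this diff:

# Sketch: compiling the exported rule lists into the regexes the tokenizer uses.
from spacy.lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)
infix_re = compile_infix_regex(TOKENIZER_INFIXES)
print(bool(prefix_re.search("(hello")), bool(suffix_re.search("hello)")))
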
@@ -1,49 +1,27 @@
-from typing import Set
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import update_exc, registry
 
 # Lemma data note:
 # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
 # Replaced characters using cedillas with the correct ones (ș and ț)
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ro"
-stop_words = {"@language_data": "spacy.ro.stop_words"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.ro.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
 class RomanianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Romanian(Language):
     lang = "ro"
     Defaults = RomanianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Romanian"]

@@ -1,4 +1,6 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH
+from ...util import update_exc
 from .punctuation import _make_ro_variants
 
 
@@ -91,4 +93,4 @@ for orth in [
         _exc[variant] = [{ORTH: variant}]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,49 +1,40 @@
-from typing import Set, Dict, Callable, Any
+from typing import Callable
 from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...util import update_exc, registry
+from ...util import registry
 from ...language import Language
 
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "ru"
-stop_words = {"@language_data": "spacy.ru.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
 
 [nlp.lemmatizer]
-@lemmatizers = "spacy.RussianLemmatizer.v1"
+@lemmatizers = "spacy.ru.RussianLemmatizer"
 """
 
 
-@registry.language_data("spacy.ru.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ru.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.lemmatizers("spacy.RussianLemmatizer.v1")
-def create_russian_lemmatizer() -> RussianLemmatizer:
+@registry.lemmatizers("spacy.ru.RussianLemmatizer")
+def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
+    def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
         return RussianLemmatizer()
 
+    return lemmatizer_factory
+
 
 class RussianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    config = Config().from_str(DEFAULT_CONFIG)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Russian(Language):
     lang = "ru"
     Defaults = RussianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Russian"]

@@ -1,66 +1,66 @@
-from ...symbols import ORTH, LEMMA, NORM
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
 
 
 _exc = {}
 
 _abbrev_exc = [
     # Weekdays abbreviations
-    {ORTH: "пн", LEMMA: "понедельник", NORM: "понедельник"},
-    {ORTH: "вт", LEMMA: "вторник", NORM: "вторник"},
-    {ORTH: "ср", LEMMA: "среда", NORM: "среда"},
-    {ORTH: "чт", LEMMA: "четверг", NORM: "четверг"},
-    {ORTH: "чтв", LEMMA: "четверг", NORM: "четверг"},
-    {ORTH: "пт", LEMMA: "пятница", NORM: "пятница"},
-    {ORTH: "сб", LEMMA: "суббота", NORM: "суббота"},
-    {ORTH: "сбт", LEMMA: "суббота", NORM: "суббота"},
-    {ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"},
-    {ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"},
-    {ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"},
+    {ORTH: "пн", NORM: "понедельник"},
+    {ORTH: "вт", NORM: "вторник"},
+    {ORTH: "ср", NORM: "среда"},
+    {ORTH: "чт", NORM: "четверг"},
+    {ORTH: "чтв", NORM: "четверг"},
+    {ORTH: "пт", NORM: "пятница"},
+    {ORTH: "сб", NORM: "суббота"},
+    {ORTH: "сбт", NORM: "суббота"},
+    {ORTH: "вс", NORM: "воскресенье"},
+    {ORTH: "вскр", NORM: "воскресенье"},
+    {ORTH: "воскр", NORM: "воскресенье"},
     # Months abbreviations
-    {ORTH: "янв", LEMMA: "январь", NORM: "январь"},
-    {ORTH: "фев", LEMMA: "февраль", NORM: "февраль"},
-    {ORTH: "февр", LEMMA: "февраль", NORM: "февраль"},
-    {ORTH: "мар", LEMMA: "март", NORM: "март"},
-    # {ORTH: "март", LEMMA: "март", NORM: "март"},
-    {ORTH: "мрт", LEMMA: "март", NORM: "март"},
-    {ORTH: "апр", LEMMA: "апрель", NORM: "апрель"},
-    # {ORTH: "май", LEMMA: "май", NORM: "май"},
-    {ORTH: "июн", LEMMA: "июнь", NORM: "июнь"},
-    # {ORTH: "июнь", LEMMA: "июнь", NORM: "июнь"},
-    {ORTH: "июл", LEMMA: "июль", NORM: "июль"},
-    # {ORTH: "июль", LEMMA: "июль", NORM: "июль"},
-    {ORTH: "авг", LEMMA: "август", NORM: "август"},
-    {ORTH: "сен", LEMMA: "сентябрь", NORM: "сентябрь"},
-    {ORTH: "сент", LEMMA: "сентябрь", NORM: "сентябрь"},
-    {ORTH: "окт", LEMMA: "октябрь", NORM: "октябрь"},
-    {ORTH: "октб", LEMMA: "октябрь", NORM: "октябрь"},
-    {ORTH: "ноя", LEMMA: "ноябрь", NORM: "ноябрь"},
-    {ORTH: "нояб", LEMMA: "ноябрь", NORM: "ноябрь"},
-    {ORTH: "нбр", LEMMA: "ноябрь", NORM: "ноябрь"},
-    {ORTH: "дек", LEMMA: "декабрь", NORM: "декабрь"},
+    {ORTH: "янв", NORM: "январь"},
+    {ORTH: "фев", NORM: "февраль"},
+    {ORTH: "февр", NORM: "февраль"},
+    {ORTH: "мар", NORM: "март"},
+    # {ORTH: "март", NORM: "март"},
+    {ORTH: "мрт", NORM: "март"},
+    {ORTH: "апр", NORM: "апрель"},
+    # {ORTH: "май", NORM: "май"},
+    {ORTH: "июн", NORM: "июнь"},
+    # {ORTH: "июнь", NORM: "июнь"},
+    {ORTH: "июл", NORM: "июль"},
+    # {ORTH: "июль", NORM: "июль"},
+    {ORTH: "авг", NORM: "август"},
+    {ORTH: "сен", NORM: "сентябрь"},
+    {ORTH: "сент", NORM: "сентябрь"},
+    {ORTH: "окт", NORM: "октябрь"},
+    {ORTH: "октб", NORM: "октябрь"},
+    {ORTH: "ноя", NORM: "ноябрь"},
+    {ORTH: "нояб", NORM: "ноябрь"},
+    {ORTH: "нбр", NORM: "ноябрь"},
+    {ORTH: "дек", NORM: "декабрь"},
 ]
 
 
 for abbrev_desc in _abbrev_exc:
     abbrev = abbrev_desc[ORTH]
     for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
-        _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
-        _exc[orth + "."] = [
-            {ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
-        ]
+        _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}]
+        _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
 
 
 _slang_exc = [
-    {ORTH: "2к15", LEMMA: "2015", NORM: "2015"},
-    {ORTH: "2к16", LEMMA: "2016", NORM: "2016"},
-    {ORTH: "2к17", LEMMA: "2017", NORM: "2017"},
-    {ORTH: "2к18", LEMMA: "2018", NORM: "2018"},
-    {ORTH: "2к19", LEMMA: "2019", NORM: "2019"},
-    {ORTH: "2к20", LEMMA: "2020", NORM: "2020"},
+    {ORTH: "2к15", NORM: "2015"},
+    {ORTH: "2к16", NORM: "2016"},
+    {ORTH: "2к17", NORM: "2017"},
+    {ORTH: "2к18", NORM: "2018"},
+    {ORTH: "2к19", NORM: "2019"},
+    {ORTH: "2к20", NORM: "2020"},
 ]
 
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "si"
-stop_words = {"@language_data": "spacy.si.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.si.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.si.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class SinhalaDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Sinhala(Language):
     lang = "si"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SinhalaDefaults
 
 
 __all__ = ["Sinhala"]

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "sk"
-stop_words = {"@language_data": "spacy.sk.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.sk.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sk.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class SlovakDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Slovak(Language):
     lang = "sk"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SlovakDefaults
 
 
 __all__ = ["Slovak"]

@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "sl"
-stop_words = {"@language_data": "spacy.sl.stop_words"}
-"""
-
-
-@registry.language_data("spacy.sl.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class SlovenianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Slovenian(Language):
     lang = "sl"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = SlovenianDefaults
 
 
 __all__ = ["Slovenian"]


@@ -1,26 +1,14 @@
-from typing import Set
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "sq"
-stop_words = {"@language_data": "spacy.sq.stop_words"}
-"""
-
-
-@registry.language_data("spacy.sq.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
+class AlbanianDefaults(Language.Defaults):
+    stop_words = STOP_WORDS
 
 
 class Albanian(Language):
     lang = "sq"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = AlbanianDefaults
 
 
 __all__ = ["Albanian"]
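
The three diffs above (Slovak, Slovenian, Albanian) all make the same move: the stringly-typed DEFAULT_CONFIG block and its registry functions are replaced by a plain Language.Defaults subclass. A minimal sketch of that pattern for a hypothetical language, using the public spacy.language import path instead of the in-package relative imports, and assuming a spaCy version that ships this refactor:

from spacy.language import Language

STOP_WORDS = {"foo", "bar"}  # placeholder data, not a real stop-word set


class ExampleDefaults(Language.Defaults):
    stop_words = STOP_WORDS


class Example(Language):
    lang = "zz"  # hypothetical language code
    Defaults = ExampleDefaults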

@@ -1,47 +1,18 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...util import update_exc, registry
-
-
-DEFAULT_CONFIG = """
-[nlp]
-lang = "sr"
-stop_words = {"@language_data": "spacy.sr.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.sr.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sr.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
 
 
 class SerbianDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Serbian(Language):
     lang = "sr"
     Defaults = SerbianDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Serbian"]
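
With the Serbian entry point reduced to class attributes, the language data is picked up as soon as a blank pipeline is built. A hedged usage sketch (assumes a spaCy release that contains this refactor):

import spacy

nlp = spacy.blank("sr")                  # instantiates Serbian with SerbianDefaults
print(len(nlp.Defaults.stop_words))      # stop words now live on the Defaults class
doc = nlp("нпр. овако")                  # "нпр." is listed in the tokenizer exceptions
print([t.text for t in doc])             # expected to stay a single token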

@@ -1,93 +1,93 @@
-from ...symbols import ORTH, LEMMA, NORM
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
 
 
 _exc = {}
 
 _abbrev_exc = [
     # Weekdays abbreviations
-    {ORTH: "пoн", LEMMA: "понедељак", NORM: "понедељак"},
-    {ORTH: "уто", LEMMA: "уторак", NORM: "уторак"},
-    {ORTH: "сре", LEMMA: "среда", NORM: "среда"},
-    {ORTH: "чет", LEMMA: "четвртак", NORM: "четвртак"},
-    {ORTH: "пет", LEMMA: "петак", NORM: "петак"},
-    {ORTH: "суб", LEMMA: "субота", NORM: "субота"},
-    {ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
+    {ORTH: "пoн", NORM: "понедељак"},
+    {ORTH: "уто", NORM: "уторак"},
+    {ORTH: "сре", NORM: "среда"},
+    {ORTH: "чет", NORM: "четвртак"},
+    {ORTH: "пет", NORM: "петак"},
+    {ORTH: "суб", NORM: "субота"},
+    {ORTH: "нед", NORM: "недеља"},
     # Months abbreviations
-    {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
-    {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
-    {ORTH: "мар", LEMMA: "март", NORM: "март"},
-    {ORTH: "апр", LEMMA: "април", NORM: "април"},
-    {ORTH: "јуни", LEMMA: "јун", NORM: "јун"},
-    {ORTH: "јули", LEMMA: "јул", NORM: "јул"},
-    {ORTH: "авг", LEMMA: "август", NORM: "август"},
-    {ORTH: "сеп", LEMMA: "септембар", NORM: "септембар"},
-    {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
-    {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
-    {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
-    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
+    {ORTH: "јан", NORM: "јануар"},
+    {ORTH: "феб", NORM: "фебруар"},
+    {ORTH: "мар", NORM: "март"},
+    {ORTH: "апр", NORM: "април"},
+    {ORTH: "јуни", NORM: "јун"},
+    {ORTH: "јули", NORM: "јул"},
+    {ORTH: "авг", NORM: "август"},
+    {ORTH: "сеп", NORM: "септембар"},
+    {ORTH: "септ", NORM: "септембар"},
+    {ORTH: "окт", NORM: "октобар"},
+    {ORTH: "нов", NORM: "новембар"},
+    {ORTH: "дец", NORM: "децембар"},
 ]
 
 
 for abbrev_desc in _abbrev_exc:
     abbrev = abbrev_desc[ORTH]
     for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
-        _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
-        _exc[orth + "."] = [
-            {ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
-        ]
+        _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}]
+        _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
 
 
 # common abbreviations
 _slang_exc = [
     # without dot
-    {ORTH: "др", LEMMA: "доктор", NORM: "доктор"},
-    {ORTH: "гдин", LEMMA: "господин", NORM: "господин"},
-    {ORTH: "гђа", LEMMA: "госпођа", NORM: "госпођа"},
-    {ORTH: "гђица", LEMMA: "госпођица", NORM: "госпођица"},
-    {ORTH: "мр", LEMMA: "магистар", NORM: "магистар"},
-    {ORTH: "Бгд", LEMMA: "Београд", NORM: "београд"},
-    {ORTH: "цм", LEMMA: "центиметар", NORM: "центиметар"},
-    {ORTH: "м", LEMMA: "метар", NORM: "метар"},
-    {ORTH: "км", LEMMA: "километар", NORM: "километар"},
-    {ORTH: "мг", LEMMA: "милиграм", NORM: "милиграм"},
-    {ORTH: "кг", LEMMA: "килограм", NORM: "килограм"},
-    {ORTH: "дл", LEMMA: "децилитар", NORM: "децилитар"},
-    {ORTH: "хл", LEMMA: "хектолитар", NORM: "хектолитар"},
+    {ORTH: "др", NORM: "доктор"},
+    {ORTH: "гдин", NORM: "господин"},
+    {ORTH: "гђа", NORM: "госпођа"},
+    {ORTH: "гђица", NORM: "госпођица"},
+    {ORTH: "мр", NORM: "магистар"},
+    {ORTH: "Бгд", NORM: "београд"},
+    {ORTH: "цм", NORM: "центиметар"},
+    {ORTH: "м", NORM: "метар"},
+    {ORTH: "км", NORM: "километар"},
+    {ORTH: "мг", NORM: "милиграм"},
+    {ORTH: "кг", NORM: "килограм"},
+    {ORTH: "дл", NORM: "децилитар"},
+    {ORTH: "хл", NORM: "хектолитар"},
     # with dot
-    {ORTH: "ул.", LEMMA: "улица", NORM: "улица"},
-    {ORTH: "бр.", LEMMA: "број", NORM: "број"},
-    {ORTH: "нпр.", LEMMA: "на пример", NORM: "на пример"},
-    {ORTH: "тзв.", LEMMA: "такозван", NORM: "такозван"},
-    {ORTH: "проф.", LEMMA: "професор", NORM: "професор"},
-    {ORTH: "стр.", LEMMA: "страна", NORM: "страна"},
-    {ORTH: "једн.", LEMMA: "једнина", NORM: "једнина"},
-    {ORTH: "мн.", LEMMA: "множина", NORM: "множина"},
-    {ORTH: "уч.", LEMMA: "ученик", NORM: "ученик"},
-    {ORTH: "разр.", LEMMA: "разред", NORM: "разред"},
-    {ORTH: "инж.", LEMMA: "инжењер", NORM: "инжењер"},
-    {ORTH: "гимн.", LEMMA: "гимназија", NORM: "гимназија"},
-    {ORTH: "год.", LEMMA: "година", NORM: "година"},
-    {ORTH: "мед.", LEMMA: "медицина", NORM: "медицина"},
-    {ORTH: "гимн.", LEMMA: "гимназија", NORM: "гимназија"},
-    {ORTH: "акад.", LEMMA: "академик", NORM: "академик"},
-    {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент"},
-    {ORTH: "итд.", LEMMA: "и тако даље", NORM: "и тако даље"},
-    {ORTH: "и сл.", LEMMA: "и слично", NORM: "и слично"},
-    {ORTH: "н.е.", LEMMA: "нова ера", NORM: "нове ере"},
-    {ORTH: "о.г.", LEMMA: "ова година", NORM: "ове године"},
-    {ORTH: "л.к.", LEMMA: "лична карта", NORM: "лична карта"},
-    {ORTH: "в.д.", LEMMA: "вршилац дужности", NORM: "вршилац дужности"},
-    {ORTH: "стр.", LEMMA: "страна", NORM: "страна"},
+    {ORTH: "ул.", NORM: "улица"},
+    {ORTH: "бр.", NORM: "број"},
+    {ORTH: "нпр.", NORM: "на пример"},
+    {ORTH: "тзв.", NORM: "такозван"},
+    {ORTH: "проф.", NORM: "професор"},
+    {ORTH: "стр.", NORM: "страна"},
+    {ORTH: "једн.", NORM: "једнина"},
+    {ORTH: "мн.", NORM: "множина"},
+    {ORTH: "уч.", NORM: "ученик"},
+    {ORTH: "разр.", NORM: "разред"},
+    {ORTH: "инж.", NORM: "инжењер"},
+    {ORTH: "гимн.", NORM: "гимназија"},
+    {ORTH: "год.", NORM: "година"},
+    {ORTH: "мед.", NORM: "медицина"},
+    {ORTH: "гимн.", NORM: "гимназија"},
+    {ORTH: "акад.", NORM: "академик"},
+    {ORTH: "доц.", NORM: "доцент"},
+    {ORTH: "итд.", NORM: "и тако даље"},
+    {ORTH: "и сл.", NORM: "и слично"},
+    {ORTH: "н.е.", NORM: "нове ере"},
+    {ORTH: "о.г.", NORM: "ове године"},
+    {ORTH: "л.к.", NORM: "лична карта"},
+    {ORTH: "в.д.", NORM: "вршилац дужности"},
+    {ORTH: "стр.", NORM: "страна"},
     # with qoute
-    {ORTH: "ал'", LEMMA: "али", NORM: "али"},
-    {ORTH: "ил'", LEMMA: "или", NORM: "или"},
-    {ORTH: "је л'", LEMMA: "је ли", NORM: "је ли"},
-    {ORTH: "да л'", LEMMA: "да ли", NORM: "да ли"},
-    {ORTH: "држ'те", LEMMA: "држати", NORM: "држите"},
+    {ORTH: "ал'", NORM: "али"},
+    {ORTH: "ил'", NORM: "или"},
+    {ORTH: "је л'", NORM: "је ли"},
+    {ORTH: "да л'", NORM: "да ли"},
+    {ORTH: "држ'те", NORM: "држите"},
 ]
 
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
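
The module now folds BASE_EXCEPTIONS into TOKENIZER_EXCEPTIONS itself via update_exc instead of leaving that merge to the Defaults class. A rough, self-contained sketch of what the merge amounts to (spaCy's real helper also validates the entries; this only shows the dict-merge idea, with made-up data):

from spacy.symbols import ORTH


def merge_exceptions(base_exc, *additions):
    merged = dict(base_exc)
    for extra in additions:
        merged.update(extra)  # language-specific entries win on key clashes
    return merged


BASE = {"e.g.": [{ORTH: "e.g."}]}
SERBIAN = {"нпр.": [{ORTH: "нпр."}]}
TOKENIZER_EXCEPTIONS = merge_exceptions(BASE, SERBIAN)
print(sorted(TOKENIZER_EXCEPTIONS))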

@@ -1,54 +1,25 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
-from ...util import update_exc, registry
 from .syntax_iterators import SYNTAX_ITERATORS
+from ...language import Language
 
 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "sv"
-stop_words = {"@language_data": "spacy.sv.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[nlp.lemmatizer.data_paths]
-@language_data = "spacy-lookups-data"
-lang = ${nlp:lang}
-"""
-
-
-@registry.language_data("spacy.sv.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.sv.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
 class SwedishDefaults(Language.Defaults):
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
+    stop_words = STOP_WORDS
 
 
 class Swedish(Language):
     lang = "sv"
     Defaults = SwedishDefaults
-    default_config = Config().from_str(DEFAULT_CONFIG)
 
 
 __all__ = ["Swedish"]

@@ -1,27 +1,18 @@
+from typing import Union, Iterator
+
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...tokens import Doc, Span
 
 
-def noun_chunks(doclike):
-    """
-    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
-    """
-    labels = [
-        "nsubj",
-        "nsubj:pass",
-        "dobj",
-        "obj",
-        "iobj",
-        "ROOT",
-        "appos",
-        "nmod",
-        "nmod:poss",
-    ]
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-
     if not doc.is_parsed:
         raise ValueError(Errors.E029)
-
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
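
The noun_chunks iterator above is what ultimately backs doc.noun_chunks once it is wired up through the SYNTAX_ITERATORS mapping that SwedishDefaults imports in the entry-point diff earlier. A hedged usage sketch; the model name sv_core_news_sm is only an example and has to be installed separately, any parsed Swedish Doc would do:

import spacy

nlp = spacy.load("sv_core_news_sm")  # example pipeline with a parser
doc = nlp("Jag köpte en röd bil i Stockholm.")
print([chunk.text for chunk in doc.noun_chunks])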

@@ -1,4 +1,6 @@
-from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import NORM, ORTH
+from ...util import update_exc
 
 _exc = {}
 
@@ -8,61 +10,58 @@ _exc = {}
 for verb_data in [
     {ORTH: "driver"},
     {ORTH: "kör"},
-    {ORTH: "hörr", LEMMA: "hör"},
+    {ORTH: "hörr"},
     {ORTH: "fattar"},
-    {ORTH: "hajar", LEMMA: "förstår"},
+    {ORTH: "hajar"},
     {ORTH: "lever"},
-    {ORTH: "serr", LEMMA: "ser"},
+    {ORTH: "serr"},
     {ORTH: "fixar"},
 ]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
-        _exc[data[ORTH] + "u"] = [
-            dict(data),
-            {ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"},
-        ]
+        _exc[data[ORTH] + "u"] = [data, {ORTH: "u", NORM: "du"}]
 
 # Abbreviations for weekdays "sön." (for "söndag" / "söner")
 # are left out because they are ambiguous. The same is the case
 # for abbreviations "jul." and "Jul." ("juli" / "jul").
 for exc_data in [
-    {ORTH: "jan.", LEMMA: "januari"},
-    {ORTH: "febr.", LEMMA: "februari"},
-    {ORTH: "feb.", LEMMA: "februari"},
-    {ORTH: "apr.", LEMMA: "april"},
-    {ORTH: "jun.", LEMMA: "juni"},
-    {ORTH: "aug.", LEMMA: "augusti"},
-    {ORTH: "sept.", LEMMA: "september"},
-    {ORTH: "sep.", LEMMA: "september"},
-    {ORTH: "okt.", LEMMA: "oktober"},
-    {ORTH: "nov.", LEMMA: "november"},
-    {ORTH: "dec.", LEMMA: "december"},
-    {ORTH: "mån.", LEMMA: "måndag"},
-    {ORTH: "tis.", LEMMA: "tisdag"},
-    {ORTH: "ons.", LEMMA: "onsdag"},
-    {ORTH: "tors.", LEMMA: "torsdag"},
-    {ORTH: "fre.", LEMMA: "fredag"},
-    {ORTH: "lör.", LEMMA: "lördag"},
-    {ORTH: "Jan.", LEMMA: "Januari"},
-    {ORTH: "Febr.", LEMMA: "Februari"},
-    {ORTH: "Feb.", LEMMA: "Februari"},
-    {ORTH: "Apr.", LEMMA: "April"},
-    {ORTH: "Jun.", LEMMA: "Juni"},
-    {ORTH: "Aug.", LEMMA: "Augusti"},
-    {ORTH: "Sept.", LEMMA: "September"},
-    {ORTH: "Sep.", LEMMA: "September"},
-    {ORTH: "Okt.", LEMMA: "Oktober"},
-    {ORTH: "Nov.", LEMMA: "November"},
-    {ORTH: "Dec.", LEMMA: "December"},
-    {ORTH: "Mån.", LEMMA: "Måndag"},
-    {ORTH: "Tis.", LEMMA: "Tisdag"},
-    {ORTH: "Ons.", LEMMA: "Onsdag"},
-    {ORTH: "Tors.", LEMMA: "Torsdag"},
-    {ORTH: "Fre.", LEMMA: "Fredag"},
-    {ORTH: "Lör.", LEMMA: "Lördag"},
-    {ORTH: "sthlm", LEMMA: "Stockholm"},
-    {ORTH: "gbg", LEMMA: "Göteborg"},
+    {ORTH: "jan.", NORM: "januari"},
+    {ORTH: "febr.", NORM: "februari"},
+    {ORTH: "feb.", NORM: "februari"},
+    {ORTH: "apr.", NORM: "april"},
+    {ORTH: "jun.", NORM: "juni"},
+    {ORTH: "aug.", NORM: "augusti"},
+    {ORTH: "sept.", NORM: "september"},
+    {ORTH: "sep.", NORM: "september"},
+    {ORTH: "okt.", NORM: "oktober"},
+    {ORTH: "nov.", NORM: "november"},
+    {ORTH: "dec.", NORM: "december"},
+    {ORTH: "mån.", NORM: "måndag"},
+    {ORTH: "tis.", NORM: "tisdag"},
+    {ORTH: "ons.", NORM: "onsdag"},
+    {ORTH: "tors.", NORM: "torsdag"},
+    {ORTH: "fre.", NORM: "fredag"},
+    {ORTH: "lör.", NORM: "lördag"},
+    {ORTH: "Jan.", NORM: "Januari"},
+    {ORTH: "Febr.", NORM: "Februari"},
+    {ORTH: "Feb.", NORM: "Februari"},
+    {ORTH: "Apr.", NORM: "April"},
+    {ORTH: "Jun.", NORM: "Juni"},
+    {ORTH: "Aug.", NORM: "Augusti"},
+    {ORTH: "Sept.", NORM: "September"},
+    {ORTH: "Sep.", NORM: "September"},
+    {ORTH: "Okt.", NORM: "Oktober"},
+    {ORTH: "Nov.", NORM: "November"},
+    {ORTH: "Dec.", NORM: "December"},
+    {ORTH: "Mån.", NORM: "Måndag"},
+    {ORTH: "Tis.", NORM: "Tisdag"},
+    {ORTH: "Ons.", NORM: "Onsdag"},
+    {ORTH: "Tors.", NORM: "Torsdag"},
+    {ORTH: "Fre.", NORM: "Fredag"},
+    {ORTH: "Lör.", NORM: "Lördag"},
+    {ORTH: "sthlm", NORM: "Stockholm"},
+    {ORTH: "gbg", NORM: "Göteborg"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
@@ -152,6 +151,6 @@ for orth in ABBREVIATIONS:
 # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
 # should be tokenized as two separate tokens.
 for orth in ["i", "m"]:
-    _exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}]
+    _exc[orth + "."] = [{ORTH: orth, NORM: orth}, {ORTH: "."}]
 
-TOKENIZER_EXCEPTIONS = _exc
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
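
The contraction loop above splits forms like "köru" into two tokens, with NORM on the second token now carrying the expanded pronoun instead of a LEMMA. A small self-contained sketch of the same idea with a reduced verb list (a plain stand-in for _exc, mirroring the keys used in the diff):

from spacy.symbols import NORM, ORTH

_exc = {}
for verb in ["driver", "kör", "fattar"]:
    for form in (verb, verb.title()):
        # "köru" -> ["kör", "u"], where "u" is normalised to "du"
        _exc[form + "u"] = [{ORTH: form}, {ORTH: "u", NORM: "du"}]

print(_exc["köru"])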

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "ta"
-stop_words = {"@language_data": "spacy.ta.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.ta.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.ta.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class TamilDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Tamil(Language):
     lang = "ta"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = TamilDefaults
 
 
 __all__ = ["Tamil"]

@@ -1,25 +0,0 @@
-from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-
-TAG_MAP = {
-    "ADV": {POS: ADV},
-    "NOUN": {POS: NOUN},
-    "ADP": {POS: ADP},
-    "PRON": {POS: PRON},
-    "SCONJ": {POS: SCONJ},
-    "PROPN": {POS: PROPN},
-    "DET": {POS: DET},
-    "SYM": {POS: SYM},
-    "INTJ": {POS: INTJ},
-    "PUNCT": {POS: PUNCT},
-    "NUM": {POS: NUM},
-    "AUX": {POS: AUX},
-    "X": {POS: X},
-    "CONJ": {POS: CONJ},
-    "CCONJ": {POS: CCONJ},
-    "ADJ": {POS: ADJ},
-    "VERB": {POS: VERB},
-    "PART": {POS: PART},
-    "_SP": {POS: SPACE},
-}

@@ -1,33 +1,16 @@
-from typing import Set, Dict, Callable, Any
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import registry
 
 
-DEFAULT_CONFIG = """
-[nlp]
-lang = "te"
-stop_words = {"@language_data": "spacy.te.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
-"""
-
-
-@registry.language_data("spacy.te.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.te.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
+class TeluguDefaults(Language.Defaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
 
 
 class Telugu(Language):
     lang = "te"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = TeluguDefaults
 
 
 __all__ = ["Telugu"]

@@ -1,4 +1,3 @@
-from typing import Set, Dict, Callable, Any
 from thinc.api import Config
 
 from .stop_words import STOP_WORDS
@@ -10,26 +9,13 @@ from ...util import DummyTokenizer, registry
 
 DEFAULT_CONFIG = """
 [nlp]
-lang = "th"
-stop_words = {"@language_data": "spacy.th.stop_words"}
-lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
 
 [nlp.tokenizer]
-@tokenizers = "spacy.ThaiTokenizer.v1"
+@tokenizers = "spacy.th.ThaiTokenizer"
 """
 
 
-@registry.language_data("spacy.th.stop_words")
-def stop_words() -> Set[str]:
-    return STOP_WORDS
-
-
-@registry.language_data("spacy.th.lex_attr_getters")
-def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
-    return LEX_ATTRS
-
-
-@registry.tokenizers("spacy.ThaiTokenizer.v1")
+@registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
         return ThaiTokenizer(nlp)
@@ -55,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer):
         return Doc(self.vocab, words=words, spaces=spaces)
 
 
+class ThaiDefaults(Language.Defaults):
+    config = Config().from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
 class Thai(Language):
     lang = "th"
-    default_config = Config().from_str(DEFAULT_CONFIG)
+    Defaults = ThaiDefaults
 
 
 __all__ = ["Thai"]
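
The Thai entry point keeps a DEFAULT_CONFIG string, but the tokenizer is now registered under the language-scoped name "spacy.th.ThaiTokenizer" and referenced from the [nlp.tokenizer] block. A hedged sketch of the same register-and-reference pattern with a hypothetical whitespace tokenizer; the registry call and factory shape mirror the diff, the tokenizer itself is illustrative only:

from spacy.tokens import Doc
from spacy.util import registry


@registry.tokenizers("example.WhitespaceTokenizer")  # hypothetical registry name
def create_whitespace_tokenizer():
    def tokenizer_factory(nlp):
        class WhitespaceTokenizer:
            def __init__(self, vocab):
                self.vocab = vocab

            def __call__(self, text):
                # naive whitespace split, standing in for a real segmenter
                return Doc(self.vocab, words=text.split())

        return WhitespaceTokenizer(nlp.vocab)

    return tokenizer_factory

# In a config, the [nlp.tokenizer] block would then point at it:
# @tokenizers = "example.WhitespaceTokenizer"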
| 
						 | 
					@ -1,469 +1,438 @@
 | 
				
			||||||
from ...symbols import ORTH, LEMMA
 | 
					from ...symbols import ORTH
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_exc = {
 | 
					_exc = {
 | 
				
			||||||
    # หน่วยงานรัฐ / government agency
 | 
					    # หน่วยงานรัฐ / government agency
 | 
				
			||||||
    "กกต.": [{ORTH: "กกต.", LEMMA: "คณะกรรมการการเลือกตั้ง"}],
 | 
					    "กกต.": [{ORTH: "กกต."}],
 | 
				
			||||||
    "กทท.": [{ORTH: "กทท.", LEMMA: "การท่าเรือแห่งประเทศไทย"}],
 | 
					    "กทท.": [{ORTH: "กทท."}],
 | 
				
			||||||
    "กทพ.": [{ORTH: "กทพ.", LEMMA: "การทางพิเศษแห่งประเทศไทย"}],
 | 
					    "กทพ.": [{ORTH: "กทพ."}],
 | 
				
			||||||
    "กบข.": [{ORTH: "กบข.", LEMMA: "กองทุนบำเหน็จบำนาญข้าราชการพลเรือน"}],
 | 
					    "กบข.": [{ORTH: "กบข."}],
 | 
				
			||||||
    "กบว.": [{ORTH: "กบว.", LEMMA: "คณะกรรมการบริหารวิทยุกระจายเสียงและวิทยุโทรทัศน์"}],
 | 
					    "กบว.": [{ORTH: "กบว."}],
 | 
				
			||||||
    "กปน.": [{ORTH: "กปน.", LEMMA: "การประปานครหลวง"}],
 | 
					    "กปน.": [{ORTH: "กปน."}],
 | 
				
			||||||
    "กปภ.": [{ORTH: "กปภ.", LEMMA: "การประปาส่วนภูมิภาค"}],
 | 
					    "กปภ.": [{ORTH: "กปภ."}],
 | 
				
			||||||
    "กปส.": [{ORTH: "กปส.", LEMMA: "กรมประชาสัมพันธ์"}],
 | 
					    "กปส.": [{ORTH: "กปส."}],
 | 
				
			||||||
    "กผม.": [{ORTH: "กผม.", LEMMA: "กองผังเมือง"}],
 | 
					    "กผม.": [{ORTH: "กผม."}],
 | 
				
			||||||
    "กฟน.": [{ORTH: "กฟน.", LEMMA: "การไฟฟ้านครหลวง"}],
 | 
					    "กฟน.": [{ORTH: "กฟน."}],
 | 
				
			||||||
    "กฟผ.": [{ORTH: "กฟผ.", LEMMA: "การไฟฟ้าฝ่ายผลิตแห่งประเทศไทย"}],
 | 
					    "กฟผ.": [{ORTH: "กฟผ."}],
 | 
				
			||||||
    "กฟภ.": [{ORTH: "กฟภ.", LEMMA: "การไฟฟ้าส่วนภูมิภาค"}],
 | 
					    "กฟภ.": [{ORTH: "กฟภ."}],
 | 
				
			||||||
    "ก.ช.น.": [{ORTH: "ก.ช.น.", LEMMA: "คณะกรรมการช่วยเหลือชาวนาชาวไร่"}],
 | 
					    "ก.ช.น.": [{ORTH: "ก.ช.น."}],
 | 
				
			||||||
    "กยศ.": [{ORTH: "กยศ.", LEMMA: "กองทุนเงินให้กู้ยืมเพื่อการศึกษา"}],
 | 
					    "กยศ.": [{ORTH: "กยศ."}],
 | 
				
			||||||
    "ก.ล.ต.": [{ORTH: "ก.ล.ต.", LEMMA: "คณะกรรมการกำกับหลักทรัพย์และตลาดหลักทรัพย์"}],
 | 
					    "ก.ล.ต.": [{ORTH: "ก.ล.ต."}],
 | 
				
			||||||
    "กศ.บ.": [{ORTH: "กศ.บ.", LEMMA: "การศึกษาบัณฑิต"}],
 | 
					    "กศ.บ.": [{ORTH: "กศ.บ."}],
 | 
				
			||||||
    "กศน.": [{ORTH: "กศน.", LEMMA: "กรมการศึกษานอกโรงเรียน"}],
 | 
					    "กศน.": [{ORTH: "กศน."}],
 | 
				
			||||||
    "กสท.": [{ORTH: "กสท.", LEMMA: "การสื่อสารแห่งประเทศไทย"}],
 | 
					    "กสท.": [{ORTH: "กสท."}],
 | 
				
			||||||
    "กอ.รมน.": [{ORTH: "กอ.รมน.", LEMMA: "กองอำนวยการรักษาความมั่นคงภายใน"}],
 | 
					    "กอ.รมน.": [{ORTH: "กอ.รมน."}],
 | 
				
			||||||
    "กร.": [{ORTH: "กร.", LEMMA: "กองเรือยุทธการ"}],
 | 
					    "กร.": [{ORTH: "กร."}],
 | 
				
			||||||
    "ขสมก.": [{ORTH: "ขสมก.", LEMMA: "องค์การขนส่งมวลชนกรุงเทพ"}],
 | 
					    "ขสมก.": [{ORTH: "ขสมก."}],
 | 
				
			||||||
    "คตง.": [{ORTH: "คตง.", LEMMA: "คณะกรรมการตรวจเงินแผ่นดิน"}],
 | 
					    "คตง.": [{ORTH: "คตง."}],
 | 
				
			||||||
    "ครม.": [{ORTH: "ครม.", LEMMA: "คณะรัฐมนตรี"}],
 | 
					    "ครม.": [{ORTH: "ครม."}],
 | 
				
			||||||
    "คมช.": [{ORTH: "คมช.", LEMMA: "คณะมนตรีความมั่นคงแห่งชาติ"}],
 | 
					    "คมช.": [{ORTH: "คมช."}],
 | 
				
			||||||
    "ตชด.": [{ORTH: "ตชด.", LEMMA: "ตำรวจตะเวนชายเดน"}],
 | 
					    "ตชด.": [{ORTH: "ตชด."}],
 | 
				
			||||||
    "ตม.": [{ORTH: "ตม.", LEMMA: "กองตรวจคนเข้าเมือง"}],
 | 
					    "ตม.": [{ORTH: "ตม."}],
 | 
				
			||||||
    "ตร.": [{ORTH: "ตร.", LEMMA: "ตำรวจ"}],
 | 
					    "ตร.": [{ORTH: "ตร."}],
 | 
				
			||||||
    "ททท.": [{ORTH: "ททท.", LEMMA: "การท่องเที่ยวแห่งประเทศไทย"}],
 | 
					    "ททท.": [{ORTH: "ททท."}],
 | 
				
			||||||
    "ททบ.": [{ORTH: "ททบ.", LEMMA: "สถานีวิทยุโทรทัศน์กองทัพบก"}],
 | 
					    "ททบ.": [{ORTH: "ททบ."}],
 | 
				
			||||||
    "ทบ.": [{ORTH: "ทบ.", LEMMA: "กองทัพบก"}],
 | 
					    "ทบ.": [{ORTH: "ทบ."}],
 | 
				
			||||||
    "ทร.": [{ORTH: "ทร.", LEMMA: "กองทัพเรือ"}],
 | 
					    "ทร.": [{ORTH: "ทร."}],
 | 
				
			||||||
    "ทอ.": [{ORTH: "ทอ.", LEMMA: "กองทัพอากาศ"}],
 | 
					    "ทอ.": [{ORTH: "ทอ."}],
 | 
				
			||||||
    "ทอท.": [{ORTH: "ทอท.", LEMMA: "การท่าอากาศยานแห่งประเทศไทย"}],
 | 
					    "ทอท.": [{ORTH: "ทอท."}],
 | 
				
			||||||
    "ธ.ก.ส.": [{ORTH: "ธ.ก.ส.", LEMMA: "ธนาคารเพื่อการเกษตรและสหกรณ์การเกษตร"}],
 | 
					    "ธ.ก.ส.": [{ORTH: "ธ.ก.ส."}],
 | 
				
			||||||
    "ธปท.": [{ORTH: "ธปท.", LEMMA: "ธนาคารแห่งประเทศไทย"}],
 | 
					    "ธปท.": [{ORTH: "ธปท."}],
 | 
				
			||||||
    "ธอส.": [{ORTH: "ธอส.", LEMMA: "ธนาคารอาคารสงเคราะห์"}],
 | 
					    "ธอส.": [{ORTH: "ธอส."}],
 | 
				
			||||||
    "นย.": [{ORTH: "นย.", LEMMA: "นาวิกโยธิน"}],
 | 
					    "นย.": [{ORTH: "นย."}],
 | 
				
			||||||
    "ปตท.": [{ORTH: "ปตท.", LEMMA: "การปิโตรเลียมแห่งประเทศไทย"}],
 | 
					    "ปตท.": [{ORTH: "ปตท."}],
 | 
				
			||||||
    "ป.ป.ช.": [
 | 
					    "ป.ป.ช.": [{ORTH: "ป.ป.ช."}],
 | 
				
			||||||
        {
 | 
					    "ป.ป.ส.": [{ORTH: "ป.ป.ส."}],
 | 
				
			||||||
            ORTH: "ป.ป.ช.",
 | 
					    "บพร.": [{ORTH: "บพร."}],
 | 
				
			||||||
            LEMMA: "คณะกรรมการป้องกันและปราบปรามการทุจริตและประพฤติมิชอบในวงราชการ",
 | 
					    "บย.": [{ORTH: "บย."}],
 | 
				
			||||||
        }
 | 
					    "พสวท.": [{ORTH: "พสวท."}],
 | 
				
			||||||
    ],
 | 
					    "มอก.": [{ORTH: "มอก."}],
 | 
				
			||||||
    "ป.ป.ส.": [{ORTH: "ป.ป.ส.", LEMMA: "คณะกรรมการป้องกันและปราบปรามยาเสพติด"}],
 | 
					    "ยธ.": [{ORTH: "ยธ."}],
 | 
				
			||||||
    "บพร.": [{ORTH: "บพร.", LEMMA: "กรมการบินพลเรือน"}],
 | 
					    "รพช.": [{ORTH: "รพช."}],
 | 
				
			||||||
    "บย.": [{ORTH: "บย.", LEMMA: "กองบินยุทธการ"}],
 | 
					    "รฟท.": [{ORTH: "รฟท."}],
 | 
				
			||||||
    "พสวท.": [
 | 
					    "รฟม.": [{ORTH: "รฟม."}],
 | 
				
			||||||
        {
 | 
					    "ศธ.": [{ORTH: "ศธ."}],
 | 
				
			||||||
            ORTH: "พสวท.",
 | 
					    "ศนธ.": [{ORTH: "ศนธ."}],
 | 
				
			||||||
            LEMMA: "โครงการพัฒนาและส่งเสริมผู้มีความรู้ความสามารถพิเศษทางวิทยาศาสตร์และเทคโนโลยี",
 | 
					    "สกจ.": [{ORTH: "สกจ."}],
 | 
				
			||||||
        }
 | 
					    "สกท.": [{ORTH: "สกท."}],
 | 
				
			||||||
    ],
 | 
					    "สกว.": [{ORTH: "สกว."}],
 | 
				
			||||||
    "มอก.": [{ORTH: "มอก.", LEMMA: "สำนักงานมาตรฐานผลิตภัณฑ์อุตสาหกรรม"}],
 | 
					    "สคบ.": [{ORTH: "สคบ."}],
 | 
				
			||||||
    "ยธ.": [{ORTH: "ยธ.", LEMMA: "กรมโยธาธิการ"}],
 | 
					    "สจร.": [{ORTH: "สจร."}],
 | 
				
			||||||
    "รพช.": [{ORTH: "รพช.", LEMMA: "สำนักงานเร่งรัดพัฒนาชนบท"}],
 | 
					    "สตง.": [{ORTH: "สตง."}],
 | 
				
			||||||
    "รฟท.": [{ORTH: "รฟท.", LEMMA: "การรถไฟแห่งประเทศไทย"}],
 | 
					    "สทท.": [{ORTH: "สทท."}],
 | 
				
			||||||
    "รฟม.": [{ORTH: "รฟม.", LEMMA: "การรถไฟฟ้าขนส่งมวลชนแห่งประเทศไทย"}],
 | 
					    "สทร.": [{ORTH: "สทร."}],
 | 
				
			||||||
    "ศธ.": [{ORTH: "ศธ.", LEMMA: "กระทรวงศึกษาธิการ"}],
 | 
					    "สธ": [{ORTH: "สธ"}],
 | 
				
			||||||
    "ศนธ.": [{ORTH: "ศนธ.", LEMMA: "ศูนย์กลางนิสิตนักศึกษาแห่งประเทศไทย"}],
 | 
					    "สนช.": [{ORTH: "สนช."}],
 | 
				
			||||||
    "สกจ.": [{ORTH: "สกจ.", LEMMA: "สหกรณ์จังหวัด"}],
 | 
					    "สนนท.": [{ORTH: "สนนท."}],
 | 
				
			||||||
    "สกท.": [{ORTH: "สกท.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมการลงทุน"}],
 | 
					    "สปก.": [{ORTH: "สปก."}],
 | 
				
			||||||
    "สกว.": [{ORTH: "สกว.", LEMMA: "สำนักงานกองทุนสนับสนุนการวิจัย"}],
 | 
					    "สปช.": [{ORTH: "สปช."}],
 | 
				
			||||||
    "สคบ.": [{ORTH: "สคบ.", LEMMA: "สำนักงานคณะกรรมการคุ้มครองผู้บริโภค"}],
 | 
					    "สปอ.": [{ORTH: "สปอ."}],
 | 
				
			||||||
    "สจร.": [{ORTH: "สจร.", LEMMA: "สำนักงานคณะกรรมการจัดระบบการจราจรทางบก"}],
 | 
					    "สพช.": [{ORTH: "สพช."}],
 | 
				
			||||||
    "สตง.": [{ORTH: "สตง.", LEMMA: "สำนักงานตรวจเงินแผ่นดิน"}],
 | 
					    "สยช.": [{ORTH: "สยช."}],
 | 
				
			||||||
    "สทท.": [{ORTH: "สทท.", LEMMA: "สถานีวิทยุโทรทัศน์แห่งประเทศไทย"}],
 | 
					    "สวช.": [{ORTH: "สวช."}],
 | 
				
			||||||
    "สทร.": [{ORTH: "สทร.", LEMMA: "สำนักงานกลางทะเบียนราษฎร์"}],
 | 
					    "สวท.": [{ORTH: "สวท."}],
 | 
				
			||||||
    "สธ": [{ORTH: "สธ", LEMMA: "กระทรวงสาธารณสุข"}],
 | 
					    "สวทช.": [{ORTH: "สวทช."}],
 | 
				
			||||||
    "สนช.": [{ORTH: "สนช.", LEMMA: "สภานิติบัญญัติแห่งชาติ,สำนักงานนวัตกรรมแห่งชาติ"}],
 | 
					    "สคช.": [{ORTH: "สคช."}],
 | 
				
			||||||
    "สนนท.": [{ORTH: "สนนท.", LEMMA: "สหพันธ์นิสิตนักศึกษาแห่งประเทศไทย"}],
 | 
					    "สสว.": [{ORTH: "สสว."}],
 | 
				
			||||||
    "สปก.": [{ORTH: "สปก.", LEMMA: "สำนักงานการปฏิรูปที่ดินเพื่อเกษตรกรรม"}],
 | 
					    "สสส.": [{ORTH: "สสส."}],
 | 
				
			||||||
    "สปช.": [{ORTH: "สปช.", LEMMA: "สำนักงานคณะกรรมการการประถมศึกษาแห่งชาติ"}],
 | 
					    "สสวท.": [{ORTH: "สสวท."}],
 | 
				
			||||||
    "สปอ.": [{ORTH: "สปอ.", LEMMA: "สำนักงานการประถมศึกษาอำเภอ"}],
 | 
					    "อตก.": [{ORTH: "อตก."}],
 | 
				
			||||||
    "สพช.": [{ORTH: "สพช.", LEMMA: "สำนักงานคณะกรรมการนโยบายพลังงานแห่งชาติ"}],
 | 
					    "อบจ.": [{ORTH: "อบจ."}],
 | 
				
			||||||
    "สยช.": [
 | 
					    "อบต.": [{ORTH: "อบต."}],
 | 
				
			||||||
        {ORTH: "สยช.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมและประสานงานเยาวชนแห่งชาติ"}
 | 
					    "อปพร.": [{ORTH: "อปพร."}],
 | 
				
			||||||
    ],
 | 
					    "อย.": [{ORTH: "อย."}],
 | 
				
			||||||
    "สวช.": [{ORTH: "สวช.", LEMMA: "สำนักงานคณะกรรมการวัฒนธรรมแห่งชาติ"}],
 | 
					    "อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท."}],
 | 
				
			||||||
    "สวท.": [{ORTH: "สวท.", LEMMA: "สถานีวิทยุกระจายเสียงแห่งประเทศไทย"}],
 | 
					 | 
				
			||||||
    "สวทช.": [{ORTH: "สวทช.", LEMMA: "สำนักงานพัฒนาวิทยาศาสตร์และเทคโนโลยีแห่งชาติ"}],
 | 
					 | 
				
			||||||
    "สคช.": [
 | 
					 | 
				
			||||||
        {ORTH: "สคช.", LEMMA: "สำนักงานคณะกรรมการพัฒนาการเศรษฐกิจและสังคมแห่งชาติ"}
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    "สสว.": [{ORTH: "สสว.", LEMMA: "สำนักงานส่งเสริมวิสาหกิจขนาดกลางและขนาดย่อม"}],
 | 
					 | 
				
			||||||
    "สสส.": [{ORTH: "สสส.", LEMMA: "สำนักงานกองทุนสนับสนุนการสร้างเสริมสุขภาพ"}],
 | 
					 | 
				
			||||||
    "สสวท.": [{ORTH: "สสวท.", LEMMA: "สถาบันส่งเสริมการสอนวิทยาศาสตร์และเทคโนโลยี"}],
 | 
					 | 
				
			||||||
    "อตก.": [{ORTH: "อตก.", LEMMA: "องค์การตลาดเพื่อเกษตรกร"}],
 | 
					 | 
				
			||||||
    "อบจ.": [{ORTH: "อบจ.", LEMMA: "องค์การบริหารส่วนจังหวัด"}],
 | 
					 | 
				
			||||||
    "อบต.": [{ORTH: "อบต.", LEMMA: "องค์การบริหารส่วนตำบล"}],
 | 
					 | 
				
			||||||
    "อปพร.": [{ORTH: "อปพร.", LEMMA: "อาสาสมัครป้องกันภัยฝ่ายพลเรือน"}],
 | 
					 | 
				
			||||||
    "อย.": [{ORTH: "อย.", LEMMA: "สำนักงานคณะกรรมการอาหารและยา"}],
 | 
					 | 
				
			||||||
    "อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท.", LEMMA: "องค์การสื่อสารมวลชนแห่งประเทศไทย"}],
 | 
					 | 
				
			||||||
    # มหาวิทยาลัย / สถานศึกษา / university / college
 | 
					    # มหาวิทยาลัย / สถานศึกษา / university / college
 | 
				
			||||||
    "มทส.": [{ORTH: "มทส.", LEMMA: "มหาวิทยาลัยเทคโนโลยีสุรนารี"}],
 | 
					    "มทส.": [{ORTH: "มทส."}],
 | 
				
			||||||
    "มธ.": [{ORTH: "มธ.", LEMMA: "มหาวิทยาลัยธรรมศาสตร์"}],
 | 
					    "มธ.": [{ORTH: "มธ."}],
 | 
				
			||||||
    "ม.อ.": [{ORTH: "ม.อ.", LEMMA: "มหาวิทยาลัยสงขลานครินทร์"}],
 | 
					    "ม.อ.": [{ORTH: "ม.อ."}],
 | 
				
			||||||
    "มทร.": [{ORTH: "มทร.", LEMMA: "มหาวิทยาลัยเทคโนโลยีราชมงคล"}],
 | 
					    "มทร.": [{ORTH: "มทร."}],
 | 
				
			||||||
    "มมส.": [{ORTH: "มมส.", LEMMA: "มหาวิทยาลัยมหาสารคาม"}],
 | 
					    "มมส.": [{ORTH: "มมส."}],
 | 
				
			||||||
    "วท.": [{ORTH: "วท.", LEMMA: "วิทยาลัยเทคนิค"}],
 | 
					    "วท.": [{ORTH: "วท."}],
 | 
				
			||||||
    "สตม.": [{ORTH: "สตม.", LEMMA: "สำนักงานตรวจคนเข้าเมือง (ตำรวจ)"}],
 | 
					    "สตม.": [{ORTH: "สตม."}],
 | 
				
			||||||
    # ยศ / rank
 | 
					    # ยศ / rank
 | 
				
			||||||
    "ดร.": [{ORTH: "ดร.", LEMMA: "ดอกเตอร์"}],
 | 
					    "ดร.": [{ORTH: "ดร."}],
 | 
				
			||||||
    "ด.ต.": [{ORTH: "ด.ต.", LEMMA: "ดาบตำรวจ"}],
 | 
					    "ด.ต.": [{ORTH: "ด.ต."}],
 | 
				
			||||||
    "จ.ต.": [{ORTH: "จ.ต.", LEMMA: "จ่าตรี"}],
 | 
					    "จ.ต.": [{ORTH: "จ.ต."}],
 | 
				
			||||||
    "จ.ท.": [{ORTH: "จ.ท.", LEMMA: "จ่าโท"}],
 | 
					    "จ.ท.": [{ORTH: "จ.ท."}],
 | 
				
			||||||
    "จ.ส.ต.": [{ORTH: "จ.ส.ต.", LEMMA: "จ่าสิบตรี (ทหารบก)"}],
 | 
					    "จ.ส.ต.": [{ORTH: "จ.ส.ต."}],
 | 
				
			||||||
    "จสต.": [{ORTH: "จสต.", LEMMA: "จ่าสิบตำรวจ"}],
 | 
					    "จสต.": [{ORTH: "จสต."}],
 | 
				
			||||||
    "จ.ส.ท.": [{ORTH: "จ.ส.ท.", LEMMA: "จ่าสิบโท"}],
 | 
					    "จ.ส.ท.": [{ORTH: "จ.ส.ท."}],
 | 
				
			||||||
    "จ.ส.อ.": [{ORTH: "จ.ส.อ.", LEMMA: "จ่าสิบเอก"}],
 | 
					    "จ.ส.อ.": [{ORTH: "จ.ส.อ."}],
 | 
				
			||||||
    "จ.อ.": [{ORTH: "จ.อ.", LEMMA: "จ่าเอก"}],
 | 
					    "จ.อ.": [{ORTH: "จ.อ."}],
 | 
				
			||||||
    "ทพญ.": [{ORTH: "ทพญ.", LEMMA: "ทันตแพทย์หญิง"}],
 | 
					    "ทพญ.": [{ORTH: "ทพญ."}],
 | 
				
			||||||
    "ทนพ.": [{ORTH: "ทนพ.", LEMMA: "เทคนิคการแพทย์"}],
 | 
					    "ทนพ.": [{ORTH: "ทนพ."}],
 | 
				
			||||||
    "นจอ.": [{ORTH: "นจอ.", LEMMA: "นักเรียนจ่าอากาศ"}],
 | 
					    "นจอ.": [{ORTH: "นจอ."}],
 | 
				
			||||||
    "น.ช.": [{ORTH: "น.ช.", LEMMA: "นักโทษชาย"}],
 | 
					    "น.ช.": [{ORTH: "น.ช."}],
 | 
				
			||||||
    "น.ญ.": [{ORTH: "น.ญ.", LEMMA: "นักโทษหญิง"}],
 | 
					    "น.ญ.": [{ORTH: "น.ญ."}],
 | 
				
			||||||
    "น.ต.": [{ORTH: "น.ต.", LEMMA: "นาวาตรี"}],
 | 
					    "น.ต.": [{ORTH: "น.ต."}],
 | 
				
			||||||
    "น.ท.": [{ORTH: "น.ท.", LEMMA: "นาวาโท"}],
 | 
					    "น.ท.": [{ORTH: "น.ท."}],
 | 
				
			||||||
    "นตท.": [{ORTH: "นตท.", LEMMA: "นักเรียนเตรียมทหาร"}],
 | 
					    "นตท.": [{ORTH: "นตท."}],
 | 
				
			||||||
    "นนส.": [{ORTH: "นนส.", LEMMA: "นักเรียนนายสิบทหารบก"}],
 | 
					    "นนส.": [{ORTH: "นนส."}],
 | 
				
			||||||
    "นนร.": [{ORTH: "นนร.", LEMMA: "นักเรียนนายร้อย"}],
 | 
					    "นนร.": [{ORTH: "นนร."}],
 | 
				
			||||||
    "นนอ.": [{ORTH: "นนอ.", LEMMA: "นักเรียนนายเรืออากาศ"}],
 | 
					    "นนอ.": [{ORTH: "นนอ."}],
 | 
				
			||||||
    "นพ.": [{ORTH: "นพ.", LEMMA: "นายแพทย์"}],
 | 
					    "นพ.": [{ORTH: "นพ."}],
 | 
				
			||||||
    "นพท.": [{ORTH: "นพท.", LEMMA: "นายแพทย์ทหาร"}],
 | 
					    "นพท.": [{ORTH: "นพท."}],
 | 
				
			||||||
    "นรจ.": [{ORTH: "นรจ.", LEMMA: "นักเรียนจ่าทหารเรือ"}],
 | 
					    "นรจ.": [{ORTH: "นรจ."}],
 | 
				
			||||||
    "นรต.": [{ORTH: "นรต.", LEMMA: "นักเรียนนายร้อยตำรวจ"}],
 | 
					    "นรต.": [{ORTH: "นรต."}],
 | 
				
			||||||
    "นศพ.": [{ORTH: "นศพ.", LEMMA: "นักศึกษาแพทย์"}],
 | 
					    "นศพ.": [{ORTH: "นศพ."}],
 | 
				
			||||||
    "นศท.": [{ORTH: "นศท.", LEMMA: "นักศึกษาวิชาทหาร"}],
 | 
					    "นศท.": [{ORTH: "นศท."}],
 | 
				
			||||||
    "น.สพ.": [{ORTH: "น.สพ.", LEMMA: "นายสัตวแพทย์ (พ.ร.บ.วิชาชีพการสัตวแพทย์)"}],
 | 
					    "น.สพ.": [{ORTH: "น.สพ."}],
 | 
				
			||||||
    "น.อ.": [{ORTH: "น.อ.", LEMMA: "นาวาเอก"}],
 | 
					    "น.อ.": [{ORTH: "น.อ."}],
 | 
				
			||||||
    "บช.ก.": [{ORTH: "บช.ก.", LEMMA: "กองบัญชาการตำรวจสอบสวนกลาง"}],
 | 
					    "บช.ก.": [{ORTH: "บช.ก."}],
 | 
				
			||||||
    "บช.น.": [{ORTH: "บช.น.", LEMMA: "กองบัญชาการตำรวจนครบาล"}],
 | 
					    "บช.น.": [{ORTH: "บช.น."}],
 | 
				
			||||||
    "ผกก.": [{ORTH: "ผกก.", LEMMA: "ผู้กำกับการ"}],
 | 
					    "ผกก.": [{ORTH: "ผกก."}],
 | 
				
			||||||
    "ผกก.ภ.": [{ORTH: "ผกก.ภ.", LEMMA: "ผู้กำกับการตำรวจภูธร"}],
 | 
					    "ผกก.ภ.": [{ORTH: "ผกก.ภ."}],
 | 
				
			||||||
    "ผจก.": [{ORTH: "ผจก.", LEMMA: "ผู้จัดการ"}],
 | 
					    "ผจก.": [{ORTH: "ผจก."}],
 | 
				
			||||||
    "ผช.": [{ORTH: "ผช.", LEMMA: "ผู้ช่วย"}],
 | 
					    "ผช.": [{ORTH: "ผช."}],
 | 
				
			||||||
    "ผชก.": [{ORTH: "ผชก.", LEMMA: "ผู้ชำนาญการ"}],
 | 
					    "ผชก.": [{ORTH: "ผชก."}],
 | 
				
			||||||
    "ผช.ผอ.": [{ORTH: "ผช.ผอ.", LEMMA: "ผู้ช่วยผู้อำนวยการ"}],
 | 
					    "ผช.ผอ.": [{ORTH: "ผช.ผอ."}],
 | 
				
			||||||
    "ผญบ.": [{ORTH: "ผญบ.", LEMMA: "ผู้ใหญ่บ้าน"}],
 | 
					    "ผญบ.": [{ORTH: "ผญบ."}],
 | 
				
			||||||
    "ผบ.": [{ORTH: "ผบ.", LEMMA: "ผู้บังคับบัญชา"}],
 | 
					    "ผบ.": [{ORTH: "ผบ."}],
 | 
				
			||||||
    "ผบก.": [{ORTH: "ผบก.", LEMMA: "ผู้บังคับบัญชาการ (ตำรวจ)"}],
 | 
					    "ผบก.": [{ORTH: "ผบก."}],
 | 
				
			||||||
    "ผบก.น.": [{ORTH: "ผบก.น.", LEMMA: "ผู้บังคับการตำรวจนครบาล"}],
 | 
					    "ผบก.น.": [{ORTH: "ผบก.น."}],
 | 
				
			||||||
    "ผบก.ป.": [{ORTH: "ผบก.ป.", LEMMA: "ผู้บังคับการตำรวจกองปราบปราม"}],
 | 
					    "ผบก.ป.": [{ORTH: "ผบก.ป."}],
 | 
				
			||||||
    "ผบก.ปค.": [
 | 
					    "ผบก.ปค.": [{ORTH: "ผบก.ปค."}],
 | 
				
			||||||
        {
 | 
					    "ผบก.ปม.": [{ORTH: "ผบก.ปม."}],
 | 
				
			||||||
            ORTH: "ผบก.ปค.",
 | 
					    "ผบก.ภ.": [{ORTH: "ผบก.ภ."}],
 | 
				
			||||||
            LEMMA: "ผู้บังคับการ กองบังคับการปกครอง (โรงเรียนนายร้อยตำรวจ)",
 | 
					    "ผบช.": [{ORTH: "ผบช."}],
 | 
				
			||||||
        }
 | 
					    "ผบช.ก.": [{ORTH: "ผบช.ก."}],
 | 
				
			||||||
    ],
 | 
					    "ผบช.ตชด.": [{ORTH: "ผบช.ตชด."}],
 | 
				
			||||||
    "ผบก.ปม.": [{ORTH: "ผบก.ปม.", LEMMA: "ผู้บังคับการตำรวจป่าไม้"}],
 | 
					    "ผบช.น.": [{ORTH: "ผบช.น."}],
 | 
				
			||||||
    "ผบก.ภ.": [{ORTH: "ผบก.ภ.", LEMMA: "ผู้บังคับการตำรวจภูธร"}],
 | 
					    "ผบช.ภ.": [{ORTH: "ผบช.ภ."}],
 | 
				
			||||||
    "ผบช.": [{ORTH: "ผบช.", LEMMA: "ผู้บัญชาการ (ตำรวจ)"}],
 | 
					    "ผบ.ทบ.": [{ORTH: "ผบ.ทบ."}],
 | 
				
			||||||
    "ผบช.ก.": [{ORTH: "ผบช.ก.", LEMMA: "ผู้บัญชาการตำรวจสอบสวนกลาง"}],
 | 
					    "ผบ.ตร.": [{ORTH: "ผบ.ตร."}],
 | 
				
			||||||
    "ผบช.ตชด.": [{ORTH: "ผบช.ตชด.", LEMMA: "ผู้บัญชาการตำรวจตระเวนชายแดน"}],
 | 
					    "ผบ.ทร.": [{ORTH: "ผบ.ทร."}],
 | 
				
			||||||
    "ผบช.น.": [{ORTH: "ผบช.น.", LEMMA: "ผู้บัญชาการตำรวจนครบาล"}],
 | 
					    "ผบ.ทอ.": [{ORTH: "ผบ.ทอ."}],
 | 
				
			||||||
    "ผบช.ภ.": [{ORTH: "ผบช.ภ.", LEMMA: "ผู้บัญชาการตำรวจภูธร"}],
 | 
					    "ผบ.ทสส.": [{ORTH: "ผบ.ทสส."}],
 | 
				
			||||||
    "ผบ.ทบ.": [{ORTH: "ผบ.ทบ.", LEMMA: "ผู้บัญชาการทหารบก"}],
 | 
					    "ผวจ.": [{ORTH: "ผวจ."}],
 | 
				
			||||||
    "ผบ.ตร.": [{ORTH: "ผบ.ตร.", LEMMA: "ผู้บัญชาการตำรวจแห่งชาติ"}],
 | 
					    "ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ"}],
 | 
				
			||||||
    "ผบ.ทร.": [{ORTH: "ผบ.ทร.", LEMMA: "ผู้บัญชาการทหารเรือ"}],
 | 
					    "พ.จ.ต.": [{ORTH: "พ.จ.ต."}],
 | 
				
			||||||
    "ผบ.ทอ.": [{ORTH: "ผบ.ทอ.", LEMMA: "ผู้บัญชาการทหารอากาศ"}],
 | 
					    "พ.จ.ท.": [{ORTH: "พ.จ.ท."}],
 | 
				
			||||||
    "ผบ.ทสส.": [{ORTH: "ผบ.ทสส.", LEMMA: "ผู้บัญชาการทหารสูงสุด"}],
 | 
					    "พ.จ.อ.": [{ORTH: "พ.จ.อ."}],
 | 
				
			||||||
    "ผวจ.": [{ORTH: "ผวจ.", LEMMA: "ผู้ว่าราชการจังหวัด"}],
 | 
					    "พญ.": [{ORTH: "พญ."}],
 | 
				
			||||||
    "ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ", LEMMA: "ผู้ว่าราชการจังหวัด"}],
 | 
					    "ฯพณฯ": [{ORTH: "ฯพณฯ"}],
 | 
				
			||||||
    "พ.จ.ต.": [{ORTH: "พ.จ.ต.", LEMMA: "พันจ่าตรี"}],
 | 
					    "พ.ต.": [{ORTH: "พ.ต."}],
 | 
				
			||||||
    "พ.จ.ท.": [{ORTH: "พ.จ.ท.", LEMMA: "พันจ่าโท"}],
 | 
					    "พ.ท.": [{ORTH: "พ.ท."}],
 | 
				
			||||||
    "พ.จ.อ.": [{ORTH: "พ.จ.อ.", LEMMA: "พันจ่าเอก"}],
 | 
					    "พ.อ.": [{ORTH: "พ.อ."}],
 | 
				
			||||||
    "พญ.": [{ORTH: "พญ.", LEMMA: "แพทย์หญิง"}],
 | 
					    "พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ"}],
 | 
				
			||||||
    "ฯพณฯ": [{ORTH: "ฯพณฯ", LEMMA: "พณท่าน"}],
 | 
					    "พลฯ": [{ORTH: "พลฯ"}],
 | 
				
			||||||
    "พ.ต.": [{ORTH: "พ.ต.", LEMMA: "พันตรี"}],
 | 
					    "พล.๑ รอ.": [{ORTH: "พล.๑ รอ."}],
 | 
				
			||||||
    "พ.ท.": [{ORTH: "พ.ท.", LEMMA: "พันโท"}],
 | 
					    "พล.ต.": [{ORTH: "พล.ต."}],
 | 
				
			||||||
    "พ.อ.": [{ORTH: "พ.อ.", LEMMA: "พันเอก"}],
 | 
					    "พล.ต.ต.": [{ORTH: "พล.ต.ต."}],
 | 
				
			||||||
    "พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ", LEMMA: "พันตำรวจเอกพิเศษ"}],
 | 
					    "พล.ต.ท.": [{ORTH: "พล.ต.ท."}],
 | 
				
			||||||
    "พลฯ": [{ORTH: "พลฯ", LEMMA: "พลทหาร"}],
 | 
					    "พล.ต.อ.": [{ORTH: "พล.ต.อ."}],
 | 
				
			||||||
    "พล.๑ รอ.": [{ORTH: "พล.๑ รอ.", LEMMA: "กองพลที่ ๑ รักษาพระองค์ กองทัพบก"}],
 | 
					    "พล.ท.": [{ORTH: "พล.ท."}],
 | 
				
			||||||
    "พล.ต.": [{ORTH: "พล.ต.", LEMMA: "พลตรี"}],
 | 
					    "พล.ปตอ.": [{ORTH: "พล.ปตอ."}],
 | 
				
			||||||
    "พล.ต.ต.": [{ORTH: "พล.ต.ต.", LEMMA: "พลตำรวจตรี"}],
 | 
					    "พล.ม.": [{ORTH: "พล.ม."}],
 | 
				
			||||||
    "พล.ต.ท.": [{ORTH: "พล.ต.ท.", LEMMA: "พลตำรวจโท"}],
 | 
					    "พล.ม.๒": [{ORTH: "พล.ม.๒"}],
 | 
				
			||||||
    "พล.ต.อ.": [{ORTH: "พล.ต.อ.", LEMMA: "พลตำรวจเอก"}],
 | 
					    "พล.ร.ต.": [{ORTH: "พล.ร.ต."}],
 | 
				
			||||||
    "พล.ท.": [{ORTH: "พล.ท.", LEMMA: "พลโท"}],
 | 
					    "พล.ร.ท.": [{ORTH: "พล.ร.ท."}],
 | 
				
			||||||
    "พล.ปตอ.": [{ORTH: "พล.ปตอ.", LEMMA: "กองพลทหารปืนใหญ่ต่อสู่อากาศยาน"}],
 | 
					    "พล.ร.อ.": [{ORTH: "พล.ร.อ."}],
 | 
				
			||||||
    "พล.ม.": [{ORTH: "พล.ม.", LEMMA: "กองพลทหารม้า"}],
 | 
					    "พล.อ.": [{ORTH: "พล.อ."}],
 | 
				
			||||||
    "พล.ม.๒": [{ORTH: "พล.ม.๒", LEMMA: "กองพลทหารม้าที่ ๒"}],
 | 
					    "พล.อ.ต.": [{ORTH: "พล.อ.ต."}],
 | 
				
			||||||
    "พล.ร.ต.": [{ORTH: "พล.ร.ต.", LEMMA: "พลเรือตรี"}],
 | 
					    "พล.อ.ท.": [{ORTH: "พล.อ.ท."}],
 | 
				
			||||||
    "พล.ร.ท.": [{ORTH: "พล.ร.ท.", LEMMA: "พลเรือโท"}],
 | 
					    "พล.อ.อ.": [{ORTH: "พล.อ.อ."}],
 | 
				
			||||||
    "พล.ร.อ.": [{ORTH: "พล.ร.อ.", LEMMA: "พลเรือเอก"}],
 | 
					    "พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ"}],
 | 
				
			||||||
    "พล.อ.": [{ORTH: "พล.อ.", LEMMA: "พลเอก"}],
 | 
					    "พ.อ.ต.": [{ORTH: "พ.อ.ต."}],
 | 
				
			||||||
    "พล.อ.ต.": [{ORTH: "พล.อ.ต.", LEMMA: "พลอากาศตรี"}],
 | 
					    "พ.อ.ท.": [{ORTH: "พ.อ.ท."}],
 | 
				
			||||||
    "พล.อ.ท.": [{ORTH: "พล.อ.ท.", LEMMA: "พลอากาศโท"}],
 | 
					    "พ.อ.อ.": [{ORTH: "พ.อ.อ."}],
 | 
				
			||||||
    "พล.อ.อ.": [{ORTH: "พล.อ.อ.", LEMMA: "พลอากาศเอก"}],
 | 
					    "ภกญ.": [{ORTH: "ภกญ."}],
 | 
				
			||||||
    "พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ", LEMMA: "พันเอกพิเศษ"}],
 | 
					    "ม.จ.": [{ORTH: "ม.จ."}],
 | 
				
			||||||
    "พ.อ.ต.": [{ORTH: "พ.อ.ต.", LEMMA: "พันจ่าอากาศตรี"}],
 | 
					    "มท1": [{ORTH: "มท1"}],
 | 
				
			||||||
    "พ.อ.ท.": [{ORTH: "พ.อ.ท.", LEMMA: "พันจ่าอากาศโท"}],
 | 
					    "ม.ร.ว.": [{ORTH: "ม.ร.ว."}],
 | 
				
			||||||
    "พ.อ.อ.": [{ORTH: "พ.อ.อ.", LEMMA: "พันจ่าอากาศเอก"}],
 | 
					    "มล.": [{ORTH: "มล."}],
 | 
				
			||||||
    "ภกญ.": [{ORTH: "ภกญ.", LEMMA: "เภสัชกรหญิง"}],
 | 
					    "ร.ต.": [{ORTH: "ร.ต."}],
 | 
				
			||||||
    "ม.จ.": [{ORTH: "ม.จ.", LEMMA: "หม่อมเจ้า"}],
 | 
					    "ร.ต.ต.": [{ORTH: "ร.ต.ต."}],
 | 
				
			||||||
    "มท1": [{ORTH: "มท1", LEMMA: "รัฐมนตรีว่าการกระทรวงมหาดไทย"}],
 | 
					    "ร.ต.ท.": [{ORTH: "ร.ต.ท."}],
 | 
				
			||||||
    "ม.ร.ว.": [{ORTH: "ม.ร.ว.", LEMMA: "หม่อมราชวงศ์"}],
 | 
					    "ร.ต.อ.": [{ORTH: "ร.ต.อ."}],
 | 
				
			||||||
    "มล.": [{ORTH: "มล.", LEMMA: "หม่อมหลวง"}],
 | 
					    "ร.ท.": [{ORTH: "ร.ท."}],
 | 
				
			||||||
    "ร.ต.": [{ORTH: "ร.ต.", LEMMA: "ร้อยตรี,เรือตรี,เรืออากาศตรี"}],
 | 
					    "รมช.": [{ORTH: "รมช."}],
 | 
				
			||||||
    "ร.ต.ต.": [{ORTH: "ร.ต.ต.", LEMMA: "ร้อยตำรวจตรี"}],
 | 
					    "รมต.": [{ORTH: "รมต."}],
 | 
				
			||||||
    "ร.ต.ท.": [{ORTH: "ร.ต.ท.", LEMMA: "ร้อยตำรวจโท"}],
 | 
					    "รมว.": [{ORTH: "รมว."}],
 | 
				
			||||||
    "ร.ต.อ.": [{ORTH: "ร.ต.อ.", LEMMA: "ร้อยตำรวจเอก"}],
 | 
					    "รศ.": [{ORTH: "รศ."}],
 | 
				
			||||||
    "ร.ท.": [{ORTH: "ร.ท.", LEMMA: "ร้อยโท,เรือโท,เรืออากาศโท"}],
 | 
					    "ร.อ.": [{ORTH: "ร.อ."}],
 | 
				
			||||||
    "รมช.": [{ORTH: "รมช.", LEMMA: "รัฐมนตรีช่วยว่าการกระทรวง"}],
 | 
					    "ศ.": [{ORTH: "ศ."}],
 | 
				
			||||||
    "รมต.": [{ORTH: "รมต.", LEMMA: "รัฐมนตรี"}],
 | 
					    "ส.ต.": [{ORTH: "ส.ต."}],
 | 
				
			||||||
    "รมว.": [{ORTH: "รมว.", LEMMA: "รัฐมนตรีว่าการกระทรวง"}],
 | 
					    "ส.ต.ต.": [{ORTH: "ส.ต.ต."}],
 | 
				
			||||||
    "รศ.": [{ORTH: "รศ.", LEMMA: "รองศาสตราจารย์"}],
 | 
					    "ส.ต.ท.": [{ORTH: "ส.ต.ท."}],
 | 
				
			||||||
    "ร.อ.": [{ORTH: "ร.อ.", LEMMA: "ร้อยเอก,เรือเอก,เรืออากาศเอก"}],
 | 
					    "ส.ต.อ.": [{ORTH: "ส.ต.อ."}],
 | 
				
			||||||
    "ศ.": [{ORTH: "ศ.", LEMMA: "ศาสตราจารย์"}],
 | 
					    "ส.ท.": [{ORTH: "ส.ท."}],
 | 
				
			||||||
    "ส.ต.": [{ORTH: "ส.ต.", LEMMA: "สิบตรี"}],
 | 
					    "สพ.": [{ORTH: "สพ."}],
 | 
				
			||||||
    "ส.ต.ต.": [{ORTH: "ส.ต.ต.", LEMMA: "สิบตำรวจตรี"}],
 | 
					    "สพ.ญ.": [{ORTH: "สพ.ญ."}],
 | 
				
			||||||
    "ส.ต.ท.": [{ORTH: "ส.ต.ท.", LEMMA: "สิบตำรวจโท"}],
 | 
					    "สพ.ช.": [{ORTH: "สพ.ช."}],
 | 
				
			||||||
    "ส.ต.อ.": [{ORTH: "ส.ต.อ.", LEMMA: "สิบตำรวจเอก"}],
 | 
					    "ส.อ.": [{ORTH: "ส.อ."}],
 | 
				
			||||||
    "ส.ท.": [{ORTH: "ส.ท.", LEMMA: "สิบโท"}],
 | 
					    "อจ.": [{ORTH: "อจ."}],
 | 
				
			||||||
    "สพ.": [{ORTH: "สพ.", LEMMA: "สัตวแพทย์"}],
 | 
					    "อจญ.": [{ORTH: "อจญ."}],
 | 
				
			||||||
    "สพ.ญ.": [{ORTH: "สพ.ญ.", LEMMA: "สัตวแพทย์หญิง"}],
 | 
					 | 
				
			||||||
    "สพ.ช.": [{ORTH: "สพ.ช.", LEMMA: "สัตวแพทย์ชาย"}],
 | 
					 | 
				
			||||||
    "ส.อ.": [{ORTH: "ส.อ.", LEMMA: "สิบเอก"}],
 | 
					 | 
				
			||||||
    "อจ.": [{ORTH: "อจ.", LEMMA: "อาจารย์"}],
 | 
					 | 
				
			||||||
    "อจญ.": [{ORTH: "อจญ.", LEMMA: "อาจารย์ใหญ่"}],
 | 
					 | 
				
			||||||
     # วุฒิ / bachelor degree
-    "ป.": [{ORTH: "ป.", LEMMA: "ประถมศึกษา"}],
+    "ป.": [{ORTH: "ป."}],
-    "ป.กศ.": [{ORTH: "ป.กศ.", LEMMA: "ประกาศนียบัตรวิชาการศึกษา"}],
+    "ป.กศ.": [{ORTH: "ป.กศ."}],
-    "ป.กศ.สูง": [{ORTH: "ป.กศ.สูง", LEMMA: "ประกาศนียบัตรวิชาการศึกษาชั้นสูง"}],
+    "ป.กศ.สูง": [{ORTH: "ป.กศ.สูง"}],
-    "ปวช.": [{ORTH: "ปวช.", LEMMA: "ประกาศนียบัตรวิชาชีพ"}],
+    "ปวช.": [{ORTH: "ปวช."}],
-    "ปวท.": [{ORTH: "ปวท.", LEMMA: "ประกาศนียบัตรวิชาชีพเทคนิค"}],
+    "ปวท.": [{ORTH: "ปวท."}],
-    "ปวส.": [{ORTH: "ปวส.", LEMMA: "ประกาศนียบัตรวิชาชีพชั้นสูง"}],
+    "ปวส.": [{ORTH: "ปวส."}],
-    "ปทส.": [{ORTH: "ปทส.", LEMMA: "ประกาศนียบัตรครูเทคนิคชั้นสูง"}],
+    "ปทส.": [{ORTH: "ปทส."}],
-    "กษ.บ.": [{ORTH: "กษ.บ.", LEMMA: "เกษตรศาสตรบัณฑิต"}],
+    "กษ.บ.": [{ORTH: "กษ.บ."}],
-    "กษ.ม.": [{ORTH: "กษ.ม.", LEMMA: "เกษตรศาสตรมหาบัณฑิต"}],
+    "กษ.ม.": [{ORTH: "กษ.ม."}],
-    "กษ.ด.": [{ORTH: "กษ.ด.", LEMMA: "เกษตรศาสตรดุษฎีบัณฑิต"}],
+    "กษ.ด.": [{ORTH: "กษ.ด."}],
-    "ค.บ.": [{ORTH: "ค.บ.", LEMMA: "ครุศาสตรบัณฑิต"}],
+    "ค.บ.": [{ORTH: "ค.บ."}],
-    "คศ.บ.": [{ORTH: "คศ.บ.", LEMMA: "คหกรรมศาสตรบัณฑิต"}],
+    "คศ.บ.": [{ORTH: "คศ.บ."}],
-    "คศ.ม.": [{ORTH: "คศ.ม.", LEMMA: "คหกรรมศาสตรมหาบัณฑิต"}],
+    "คศ.ม.": [{ORTH: "คศ.ม."}],
-    "คศ.ด.": [{ORTH: "คศ.ด.", LEMMA: "คหกรรมศาสตรดุษฎีบัณฑิต"}],
+    "คศ.ด.": [{ORTH: "คศ.ด."}],
-    "ค.อ.บ.": [{ORTH: "ค.อ.บ.", LEMMA: "ครุศาสตรอุตสาหกรรมบัณฑิต"}],
+    "ค.อ.บ.": [{ORTH: "ค.อ.บ."}],
-    "ค.อ.ม.": [{ORTH: "ค.อ.ม.", LEMMA: "ครุศาสตรอุตสาหกรรมมหาบัณฑิต"}],
+    "ค.อ.ม.": [{ORTH: "ค.อ.ม."}],
-    "ค.อ.ด.": [{ORTH: "ค.อ.ด.", LEMMA: "ครุศาสตรอุตสาหกรรมดุษฎีบัณฑิต"}],
+    "ค.อ.ด.": [{ORTH: "ค.อ.ด."}],
-    "ทก.บ.": [{ORTH: "ทก.บ.", LEMMA: "เทคโนโลยีการเกษตรบัณฑิต"}],
+    "ทก.บ.": [{ORTH: "ทก.บ."}],
-    "ทก.ม.": [{ORTH: "ทก.ม.", LEMMA: "เทคโนโลยีการเกษตรมหาบัณฑิต"}],
+    "ทก.ม.": [{ORTH: "ทก.ม."}],
-    "ทก.ด.": [{ORTH: "ทก.ด.", LEMMA: "เทคโนโลยีการเกษตรดุษฎีบัณฑิต"}],
+    "ทก.ด.": [{ORTH: "ทก.ด."}],
-    "ท.บ.": [{ORTH: "ท.บ.", LEMMA: "ทันตแพทยศาสตรบัณฑิต"}],
+    "ท.บ.": [{ORTH: "ท.บ."}],
-    "ท.ม.": [{ORTH: "ท.ม.", LEMMA: "ทันตแพทยศาสตรมหาบัณฑิต"}],
+    "ท.ม.": [{ORTH: "ท.ม."}],
-    "ท.ด.": [{ORTH: "ท.ด.", LEMMA: "ทันตแพทยศาสตรดุษฎีบัณฑิต"}],
+    "ท.ด.": [{ORTH: "ท.ด."}],
-    "น.บ.": [{ORTH: "น.บ.", LEMMA: "นิติศาสตรบัณฑิต"}],
+    "น.บ.": [{ORTH: "น.บ."}],
-    "น.ม.": [{ORTH: "น.ม.", LEMMA: "นิติศาสตรมหาบัณฑิต"}],
+    "น.ม.": [{ORTH: "น.ม."}],
-    "น.ด.": [{ORTH: "น.ด.", LEMMA: "นิติศาสตรดุษฎีบัณฑิต"}],
+    "น.ด.": [{ORTH: "น.ด."}],
-    "นศ.บ.": [{ORTH: "นศ.บ.", LEMMA: "นิเทศศาสตรบัณฑิต"}],
+    "นศ.บ.": [{ORTH: "นศ.บ."}],
-    "นศ.ม.": [{ORTH: "นศ.ม.", LEMMA: "นิเทศศาสตรมหาบัณฑิต"}],
+    "นศ.ม.": [{ORTH: "นศ.ม."}],
-    "นศ.ด.": [{ORTH: "นศ.ด.", LEMMA: "นิเทศศาสตรดุษฎีบัณฑิต"}],
+    "นศ.ด.": [{ORTH: "นศ.ด."}],
-    "บช.บ.": [{ORTH: "บช.บ.", LEMMA: "บัญชีบัณฑิต"}],
+    "บช.บ.": [{ORTH: "บช.บ."}],
-    "บช.ม.": [{ORTH: "บช.ม.", LEMMA: "บัญชีมหาบัณฑิต"}],
+    "บช.ม.": [{ORTH: "บช.ม."}],
-    "บช.ด.": [{ORTH: "บช.ด.", LEMMA: "บัญชีดุษฎีบัณฑิต"}],
+    "บช.ด.": [{ORTH: "บช.ด."}],
-    "บธ.บ.": [{ORTH: "บธ.บ.", LEMMA: "บริหารธุรกิจบัณฑิต"}],
+    "บธ.บ.": [{ORTH: "บธ.บ."}],
-    "บธ.ม.": [{ORTH: "บธ.ม.", LEMMA: "บริหารธุรกิจมหาบัณฑิต"}],
+    "บธ.ม.": [{ORTH: "บธ.ม."}],
-    "บธ.ด.": [{ORTH: "บธ.ด.", LEMMA: "บริหารธุรกิจดุษฎีบัณฑิต"}],
+    "บธ.ด.": [{ORTH: "บธ.ด."}],
-    "พณ.บ.": [{ORTH: "พณ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}],
+    "พณ.บ.": [{ORTH: "พณ.บ."}],
-    "พณ.ม.": [{ORTH: "พณ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}],
+    "พณ.ม.": [{ORTH: "พณ.ม."}],
-    "พณ.ด.": [{ORTH: "พณ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}],
+    "พณ.ด.": [{ORTH: "พณ.ด."}],
-    "พ.บ.": [{ORTH: "พ.บ.", LEMMA: "แพทยศาสตรบัณฑิต"}],
+    "พ.บ.": [{ORTH: "พ.บ."}],
-    "พ.ม.": [{ORTH: "พ.ม.", LEMMA: "แพทยศาสตรมหาบัณฑิต"}],
+    "พ.ม.": [{ORTH: "พ.ม."}],
-    "พ.ด.": [{ORTH: "พ.ด.", LEMMA: "แพทยศาสตรดุษฎีบัณฑิต"}],
+    "พ.ด.": [{ORTH: "พ.ด."}],
-    "พธ.บ.": [{ORTH: "พธ.บ.", LEMMA: "พุทธศาสตรบัณฑิต"}],
+    "พธ.บ.": [{ORTH: "พธ.บ."}],
-    "พธ.ม.": [{ORTH: "พธ.ม.", LEMMA: "พุทธศาสตรมหาบัณฑิต"}],
+    "พธ.ม.": [{ORTH: "พธ.ม."}],
-    "พธ.ด.": [{ORTH: "พธ.ด.", LEMMA: "พุทธศาสตรดุษฎีบัณฑิต"}],
+    "พธ.ด.": [{ORTH: "พธ.ด."}],
-    "พบ.บ.": [{ORTH: "พบ.บ.", LEMMA: "พัฒนบริหารศาสตรบัณฑิต"}],
+    "พบ.บ.": [{ORTH: "พบ.บ."}],
-    "พบ.ม.": [{ORTH: "พบ.ม.", LEMMA: "พัฒนบริหารศาสตรมหาบัณฑิต"}],
+    "พบ.ม.": [{ORTH: "พบ.ม."}],
-    "พบ.ด.": [{ORTH: "พบ.ด.", LEMMA: "พัฒนบริหารศาสตรดุษฎีบัณฑิต"}],
+    "พบ.ด.": [{ORTH: "พบ.ด."}],
-    "พย.บ.": [{ORTH: "พย.บ.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}],
+    "พย.บ.": [{ORTH: "พย.บ."}],
-    "พย.ม.": [{ORTH: "พย.ม.", LEMMA: "พยาบาลศาสตรมหาบัณฑิต"}],
+    "พย.ม.": [{ORTH: "พย.ม."}],
-    "พย.ด.": [{ORTH: "พย.ด.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}],
+    "พย.ด.": [{ORTH: "พย.ด."}],
-    "พศ.บ.": [{ORTH: "พศ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}],
+    "พศ.บ.": [{ORTH: "พศ.บ."}],
-    "พศ.ม.": [{ORTH: "พศ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}],
+    "พศ.ม.": [{ORTH: "พศ.ม."}],
-    "พศ.ด.": [{ORTH: "พศ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}],
+    "พศ.ด.": [{ORTH: "พศ.ด."}],
-    "ภ.บ.": [{ORTH: "ภ.บ.", LEMMA: "เภสัชศาสตรบัณฑิต"}],
+    "ภ.บ.": [{ORTH: "ภ.บ."}],
-    "ภ.ม.": [{ORTH: "ภ.ม.", LEMMA: "เภสัชศาสตรมหาบัณฑิต"}],
+    "ภ.ม.": [{ORTH: "ภ.ม."}],
-    "ภ.ด.": [{ORTH: "ภ.ด.", LEMMA: "เภสัชศาสตรดุษฎีบัณฑิต"}],
+    "ภ.ด.": [{ORTH: "ภ.ด."}],
-    "ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ.", LEMMA: "ภูมิสถาปัตยกรรมศาสตรบัณฑิต"}],
+    "ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ."}],
-    "รป.บ.": [{ORTH: "รป.บ.", LEMMA: "รัฐประศาสนศาสตร์บัณฑิต"}],
+    "รป.บ.": [{ORTH: "รป.บ."}],
-    "รป.ม.": [{ORTH: "รป.ม.", LEMMA: "รัฐประศาสนศาสตร์มหาบัณฑิต"}],
+    "รป.ม.": [{ORTH: "รป.ม."}],
-    "วท.บ.": [{ORTH: "วท.บ.", LEMMA: "วิทยาศาสตรบัณฑิต"}],
+    "วท.บ.": [{ORTH: "วท.บ."}],
-    "วท.ม.": [{ORTH: "วท.ม.", LEMMA: "วิทยาศาสตรมหาบัณฑิต"}],
+    "วท.ม.": [{ORTH: "วท.ม."}],
-    "วท.ด.": [{ORTH: "วท.ด.", LEMMA: "วิทยาศาสตรดุษฎีบัณฑิต"}],
+    "วท.ด.": [{ORTH: "วท.ด."}],
-    "ศ.บ.": [{ORTH: "ศ.บ.", LEMMA: "ศิลปบัณฑิต"}],
+    "ศ.บ.": [{ORTH: "ศ.บ."}],
-    "ศศ.บ.": [{ORTH: "ศศ.บ.", LEMMA: "ศิลปศาสตรบัณฑิต"}],
+    "ศศ.บ.": [{ORTH: "ศศ.บ."}],
-    "ศษ.บ.": [{ORTH: "ศษ.บ.", LEMMA: "ศึกษาศาสตรบัณฑิต"}],
+    "ศษ.บ.": [{ORTH: "ศษ.บ."}],
-    "ศส.บ.": [{ORTH: "ศส.บ.", LEMMA: "เศรษฐศาสตรบัณฑิต"}],
+    "ศส.บ.": [{ORTH: "ศส.บ."}],
-    "สถ.บ.": [{ORTH: "สถ.บ.", LEMMA: "สถาปัตยกรรมศาสตรบัณฑิต"}],
+    "สถ.บ.": [{ORTH: "สถ.บ."}],
-    "สถ.ม.": [{ORTH: "สถ.ม.", LEMMA: "สถาปัตยกรรมศาสตรมหาบัณฑิต"}],
+    "สถ.ม.": [{ORTH: "สถ.ม."}],
-    "สถ.ด.": [{ORTH: "สถ.ด.", LEMMA: "สถาปัตยกรรมศาสตรดุษฎีบัณฑิต"}],
+    "สถ.ด.": [{ORTH: "สถ.ด."}],
-    "สพ.บ.": [{ORTH: "สพ.บ.", LEMMA: "สัตวแพทยศาสตรบัณฑิต"}],
+    "สพ.บ.": [{ORTH: "สพ.บ."}],
-    "อ.บ.": [{ORTH: "อ.บ.", LEMMA: "อักษรศาสตรบัณฑิต"}],
+    "อ.บ.": [{ORTH: "อ.บ."}],
-    "อ.ม.": [{ORTH: "อ.ม.", LEMMA: "อักษรศาสตรมหาบัณฑิต"}],
+    "อ.ม.": [{ORTH: "อ.ม."}],
-    "อ.ด.": [{ORTH: "อ.ด.", LEMMA: "อักษรศาสตรดุษฎีบัณฑิต"}],
+    "อ.ด.": [{ORTH: "อ.ด."}],
     # ปี / เวลา / year / time
-    "ชม.": [{ORTH: "ชม.", LEMMA: "ชั่วโมง"}],
+    "ชม.": [{ORTH: "ชม."}],
-    "จ.ศ.": [{ORTH: "จ.ศ.", LEMMA: "จุลศักราช"}],
+    "จ.ศ.": [{ORTH: "จ.ศ."}],
-    "ค.ศ.": [{ORTH: "ค.ศ.", LEMMA: "คริสต์ศักราช"}],
+    "ค.ศ.": [{ORTH: "ค.ศ."}],
-    "ฮ.ศ.": [{ORTH: "ฮ.ศ.", LEMMA: "ฮิจเราะห์ศักราช"}],
+    "ฮ.ศ.": [{ORTH: "ฮ.ศ."}],
-    "ว.ด.ป.": [{ORTH: "ว.ด.ป.", LEMMA: "วัน เดือน ปี"}],
+    "ว.ด.ป.": [{ORTH: "ว.ด.ป."}],
     # ระยะทาง / distance
-    "ฮม.": [{ORTH: "ฮม.", LEMMA: "เฮกโตเมตร"}],
+    "ฮม.": [{ORTH: "ฮม."}],
-    "ดคม.": [{ORTH: "ดคม.", LEMMA: "เดคาเมตร"}],
+    "ดคม.": [{ORTH: "ดคม."}],
-    "ดม.": [{ORTH: "ดม.", LEMMA: "เดซิเมตร"}],
+    "ดม.": [{ORTH: "ดม."}],
-    "มม.": [{ORTH: "มม.", LEMMA: "มิลลิเมตร"}],
+    "มม.": [{ORTH: "มม."}],
-    "ซม.": [{ORTH: "ซม.", LEMMA: "เซนติเมตร"}],
+    "ซม.": [{ORTH: "ซม."}],
-    "กม.": [{ORTH: "กม.", LEMMA: "กิโลเมตร"}],
+    "กม.": [{ORTH: "กม."}],
     # น้ำหนัก / weight
-    "น.น.": [{ORTH: "น.น.", LEMMA: "น้ำหนัก"}],
+    "น.น.": [{ORTH: "น.น."}],
-    "ฮก.": [{ORTH: "ฮก.", LEMMA: "เฮกโตกรัม"}],
+    "ฮก.": [{ORTH: "ฮก."}],
-    "ดคก.": [{ORTH: "ดคก.", LEMMA: "เดคากรัม"}],
+    "ดคก.": [{ORTH: "ดคก."}],
-    "ดก.": [{ORTH: "ดก.", LEMMA: "เดซิกรัม"}],
+    "ดก.": [{ORTH: "ดก."}],
-    "ซก.": [{ORTH: "ซก.", LEMMA: "เซนติกรัม"}],
+    "ซก.": [{ORTH: "ซก."}],
-    "มก.": [{ORTH: "มก.", LEMMA: "มิลลิกรัม"}],
+    "มก.": [{ORTH: "มก."}],
-    "ก.": [{ORTH: "ก.", LEMMA: "กรัม"}],
+    "ก.": [{ORTH: "ก."}],
-    "กก.": [{ORTH: "กก.", LEMMA: "กิโลกรัม"}],
+    "กก.": [{ORTH: "กก."}],
     # ปริมาตร / volume
-    "ฮล.": [{ORTH: "ฮล.", LEMMA: "เฮกโตลิตร"}],
+    "ฮล.": [{ORTH: "ฮล."}],
-    "ดคล.": [{ORTH: "ดคล.", LEMMA: "เดคาลิตร"}],
+    "ดคล.": [{ORTH: "ดคล."}],
-    "ดล.": [{ORTH: "ดล.", LEMMA: "เดซิลิตร"}],
+    "ดล.": [{ORTH: "ดล."}],
-    "ซล.": [{ORTH: "ซล.", LEMMA: "เซนติลิตร"}],
+    "ซล.": [{ORTH: "ซล."}],
-    "ล.": [{ORTH: "ล.", LEMMA: "ลิตร"}],
+    "ล.": [{ORTH: "ล."}],
-    "กล.": [{ORTH: "กล.", LEMMA: "กิโลลิตร"}],
+    "กล.": [{ORTH: "กล."}],
-    "ลบ.": [{ORTH: "ลบ.", LEMMA: "ลูกบาศก์"}],
+    "ลบ.": [{ORTH: "ลบ."}],
     # พื้นที่ / area
-    "ตร.ซม.": [{ORTH: "ตร.ซม.", LEMMA: "ตารางเซนติเมตร"}],
+    "ตร.ซม.": [{ORTH: "ตร.ซม."}],
-    "ตร.ม.": [{ORTH: "ตร.ม.", LEMMA: "ตารางเมตร"}],
+    "ตร.ม.": [{ORTH: "ตร.ม."}],
-    "ตร.ว.": [{ORTH: "ตร.ว.", LEMMA: "ตารางวา"}],
+    "ตร.ว.": [{ORTH: "ตร.ว."}],
-    "ตร.กม.": [{ORTH: "ตร.กม.", LEMMA: "ตารางกิโลเมตร"}],
+    "ตร.กม.": [{ORTH: "ตร.กม."}],
     # เดือน / month
-    "ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}],
+    "ม.ค.": [{ORTH: "ม.ค."}],
-    "ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}],
+    "ก.พ.": [{ORTH: "ก.พ."}],
-    "มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}],
+    "มี.ค.": [{ORTH: "มี.ค."}],
-    "เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}],
+    "เม.ย.": [{ORTH: "เม.ย."}],
-    "พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}],
+    "พ.ค.": [{ORTH: "พ.ค."}],
-    "มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}],
+    "มิ.ย.": [{ORTH: "มิ.ย."}],
-    "ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}],
+    "ก.ค.": [{ORTH: "ก.ค."}],
-    "ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}],
+    "ส.ค.": [{ORTH: "ส.ค."}],
-    "ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}],
+    "ก.ย.": [{ORTH: "ก.ย."}],
-    "ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}],
+    "ต.ค.": [{ORTH: "ต.ค."}],
-    "พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}],
+    "พ.ย.": [{ORTH: "พ.ย."}],
-    "ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}],
+    "ธ.ค.": [{ORTH: "ธ.ค."}],
     # เพศ / gender
-    "ช.": [{ORTH: "ช.", LEMMA: "ชาย"}],
+    "ช.": [{ORTH: "ช."}],
-    "ญ.": [{ORTH: "ญ.", LEMMA: "หญิง"}],
+    "ญ.": [{ORTH: "ญ."}],
-    "ด.ช.": [{ORTH: "ด.ช.", LEMMA: "เด็กชาย"}],
+    "ด.ช.": [{ORTH: "ด.ช."}],
-    "ด.ญ.": [{ORTH: "ด.ญ.", LEMMA: "เด็กหญิง"}],
+    "ด.ญ.": [{ORTH: "ด.ญ."}],
     # ที่อยู่ / address
-    "ถ.": [{ORTH: "ถ.", LEMMA: "ถนน"}],
+    "ถ.": [{ORTH: "ถ."}],
-    "ต.": [{ORTH: "ต.", LEMMA: "ตำบล"}],
+    "ต.": [{ORTH: "ต."}],
-    "อ.": [{ORTH: "อ.", LEMMA: "อำเภอ"}],
+    "อ.": [{ORTH: "อ."}],
-    "จ.": [{ORTH: "จ.", LEMMA: "จังหวัด"}],
+    "จ.": [{ORTH: "จ."}],
     # สรรพนาม / pronoun
-    "ข้าฯ": [{ORTH: "ข้าฯ", LEMMA: "ข้าพระพุทธเจ้า"}],
+    "ข้าฯ": [{ORTH: "ข้าฯ"}],
-    "ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ", LEMMA: "ทูลเกล้าทูลกระหม่อม"}],
+    "ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ"}],
-    "น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ", LEMMA: "น้อมเกล้าน้อมกระหม่อม"}],
+    "น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ"}],
-    "โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ", LEMMA: "โปรดเกล้าโปรดกระหม่อม"}],
+    "โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ"}],
     # การเมือง / politic
-    "ขจก.": [{ORTH: "ขจก.", LEMMA: "ขบวนการโจรก่อการร้าย"}],
+    "ขจก.": [{ORTH: "ขจก."}],
-    "ขบด.": [{ORTH: "ขบด.", LEMMA: "ขบวนการแบ่งแยกดินแดน"}],
+    "ขบด.": [{ORTH: "ขบด."}],
-    "นปช.": [{ORTH: "นปช.", LEMMA: "แนวร่วมประชาธิปไตยขับไล่เผด็จการ"}],
+    "นปช.": [{ORTH: "นปช."}],
-    "ปชป.": [{ORTH: "ปชป.", LEMMA: "พรรคประชาธิปัตย์"}],
+    "ปชป.": [{ORTH: "ปชป."}],
-    "ผกค.": [{ORTH: "ผกค.", LEMMA: "ผู้ก่อการร้ายคอมมิวนิสต์"}],
+    "ผกค.": [{ORTH: "ผกค."}],
-    "พท.": [{ORTH: "พท.", LEMMA: "พรรคเพื่อไทย"}],
+    "พท.": [{ORTH: "พท."}],
-    "พ.ร.ก.": [{ORTH: "พ.ร.ก.", LEMMA: "พระราชกำหนด"}],
+    "พ.ร.ก.": [{ORTH: "พ.ร.ก."}],
-    "พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ.", LEMMA: "พระราชกฤษฎีกา"}],
+    "พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ."}],
-    "พ.ร.บ.": [{ORTH: "พ.ร.บ.", LEMMA: "พระราชบัญญัติ"}],
+    "พ.ร.บ.": [{ORTH: "พ.ร.บ."}],
-    "รธน.": [{ORTH: "รธน.", LEMMA: "รัฐธรรมนูญ"}],
+    "รธน.": [{ORTH: "รธน."}],
-    "รบ.": [{ORTH: "รบ.", LEMMA: "รัฐบาล"}],
+    "รบ.": [{ORTH: "รบ."}],
-    "รสช.": [{ORTH: "รสช.", LEMMA: "คณะรักษาความสงบเรียบร้อยแห่งชาติ"}],
+    "รสช.": [{ORTH: "รสช."}],
-    "ส.ก.": [{ORTH: "ส.ก.", LEMMA: "สมาชิกสภากรุงเทพมหานคร"}],
+    "ส.ก.": [{ORTH: "ส.ก."}],
-    "สจ.": [{ORTH: "สจ.", LEMMA: "สมาชิกสภาจังหวัด"}],
+    "สจ.": [{ORTH: "สจ."}],
-    "สว.": [{ORTH: "สว.", LEMMA: "สมาชิกวุฒิสภา"}],
+    "สว.": [{ORTH: "สว."}],
-    "ส.ส.": [{ORTH: "ส.ส.", LEMMA: "สมาชิกสภาผู้แทนราษฎร"}],
+    "ส.ส.": [{ORTH: "ส.ส."}],
     # ทั่วไป / general
-    "ก.ข.ค.": [{ORTH: "ก.ข.ค.", LEMMA: "ก้างขวางคอ"}],
+    "ก.ข.ค.": [{ORTH: "ก.ข.ค."}],
-    "กทม.": [{ORTH: "กทม.", LEMMA: "กรุงเทพมหานคร"}],
+    "กทม.": [{ORTH: "กทม."}],
-    "กรุงเทพฯ": [{ORTH: "กรุงเทพฯ", LEMMA: "กรุงเทพมหานคร"}],
+    "กรุงเทพฯ": [{ORTH: "กรุงเทพฯ"}],
-    "ขรก.": [{ORTH: "ขรก.", LEMMA: "ข้าราชการ"}],
+    "ขรก.": [{ORTH: "ขรก."}],
-    "ขส": [{ORTH: "ขส.", LEMMA: "ขนส่ง"}],
+    "ขส": [{ORTH: "ขส."}],
-    "ค.ร.น.": [{ORTH: "ค.ร.น.", LEMMA: "คูณร่วมน้อย"}],
+    "ค.ร.น.": [{ORTH: "ค.ร.น."}],
-    "ค.ร.ม.": [{ORTH: "ค.ร.ม.", LEMMA: "คูณร่วมมาก"}],
+    "ค.ร.ม.": [{ORTH: "ค.ร.ม."}],
-    "ง.ด.": [{ORTH: "ง.ด.", LEMMA: "เงินเดือน"}],
+    "ง.ด.": [{ORTH: "ง.ด."}],
-    "งป.": [{ORTH: "งป.", LEMMA: "งบประมาณ"}],
+    "งป.": [{ORTH: "งป."}],
-    "จก.": [{ORTH: "จก.", LEMMA: "จำกัด"}],
+    "จก.": [{ORTH: "จก."}],
-    "จขกท.": [{ORTH: "จขกท.", LEMMA: "เจ้าของกระทู้"}],
+    "จขกท.": [{ORTH: "จขกท."}],
-    "จนท.": [{ORTH: "จนท.", LEMMA: "เจ้าหน้าที่"}],
+    "จนท.": [{ORTH: "จนท."}],
-    "จ.ป.ร.": [
-        {
-            ORTH: "จ.ป.ร.",
-            LEMMA: "มหาจุฬาลงกรณ ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัว)",
-        }
-    ],
+    "จ.ป.ร.": [{ORTH: "จ.ป.ร."}],
-    "จ.ม.": [{ORTH: "จ.ม.", LEMMA: "จดหมาย"}],
+    "จ.ม.": [{ORTH: "จ.ม."}],
-    "จย.": [{ORTH: "จย.", LEMMA: "จักรยาน"}],
+    "จย.": [{ORTH: "จย."}],
-    "จยย.": [{ORTH: "จยย.", LEMMA: "จักรยานยนต์"}],
+    "จยย.": [{ORTH: "จยย."}],
-    "ตจว.": [{ORTH: "ตจว.", LEMMA: "ต่างจังหวัด"}],
+    "ตจว.": [{ORTH: "ตจว."}],
-    "โทร.": [{ORTH: "โทร.", LEMMA: "โทรศัพท์"}],
+    "โทร.": [{ORTH: "โทร."}],
-    "ธ.": [{ORTH: "ธ.", LEMMA: "ธนาคาร"}],
+    "ธ.": [{ORTH: "ธ."}],
-    "น.ร.": [{ORTH: "น.ร.", LEMMA: "นักเรียน"}],
+    "น.ร.": [{ORTH: "น.ร."}],
-    "น.ศ.": [{ORTH: "น.ศ.", LEMMA: "นักศึกษา"}],
+    "น.ศ.": [{ORTH: "น.ศ."}],
-    "น.ส.": [{ORTH: "น.ส.", LEMMA: "นางสาว"}],
+    "น.ส.": [{ORTH: "น.ส."}],
-    "น.ส.๓": [{ORTH: "น.ส.๓", LEMMA: "หนังสือรับรองการทำประโยชน์ในที่ดิน"}],
+    "น.ส.๓": [{ORTH: "น.ส.๓"}],
-    "น.ส.๓ ก.": [
-        {ORTH: "น.ส.๓ ก", LEMMA: "หนังสือแสดงกรรมสิทธิ์ในที่ดิน (มีระวางกำหนด)"}
-    ],
+    "น.ส.๓ ก.": [{ORTH: "น.ส.๓ ก"}],
-    "นสพ.": [{ORTH: "นสพ.", LEMMA: "หนังสือพิมพ์"}],
+    "นสพ.": [{ORTH: "นสพ."}],
-    "บ.ก.": [{ORTH: "บ.ก.", LEMMA: "บรรณาธิการ"}],
+    "บ.ก.": [{ORTH: "บ.ก."}],
-    "บจก.": [{ORTH: "บจก.", LEMMA: "บริษัทจำกัด"}],
+    "บจก.": [{ORTH: "บจก."}],
-    "บงล.": [{ORTH: "บงล.", LEMMA: "บริษัทเงินทุนและหลักทรัพย์จำกัด"}],
+    "บงล.": [{ORTH: "บงล."}],
-    "บบส.": [{ORTH: "บบส.", LEMMA: "บรรษัทบริหารสินทรัพย์สถาบันการเงิน"}],
+    "บบส.": [{ORTH: "บบส."}],
-    "บมจ.": [{ORTH: "บมจ.", LEMMA: "บริษัทมหาชนจำกัด"}],
+    "บมจ.": [{ORTH: "บมจ."}],
-    "บลจ.": [{ORTH: "บลจ.", LEMMA: "บริษัทหลักทรัพย์จัดการกองทุนรวมจำกัด"}],
+    "บลจ.": [{ORTH: "บลจ."}],
-    "บ/ช": [{ORTH: "บ/ช", LEMMA: "บัญชี"}],
+    "บ/ช": [{ORTH: "บ/ช"}],
-    "บร.": [{ORTH: "บร.", LEMMA: "บรรณารักษ์"}],
+    "บร.": [{ORTH: "บร."}],
-    "ปชช.": [{ORTH: "ปชช.", LEMMA: "ประชาชน"}],
+    "ปชช.": [{ORTH: "ปชช."}],
-    "ปณ.": [{ORTH: "ปณ.", LEMMA: "ที่ทำการไปรษณีย์"}],
+    "ปณ.": [{ORTH: "ปณ."}],
-    "ปณก.": [{ORTH: "ปณก.", LEMMA: "ที่ทำการไปรษณีย์กลาง"}],
+    "ปณก.": [{ORTH: "ปณก."}],
-    "ปณส.": [{ORTH: "ปณส.", LEMMA: "ที่ทำการไปรษณีย์สาขา"}],
+    "ปณส.": [{ORTH: "ปณส."}],
-    "ปธ.": [{ORTH: "ปธ.", LEMMA: "ประธาน"}],
+    "ปธ.": [{ORTH: "ปธ."}],
-    "ปธน.": [{ORTH: "ปธน.", LEMMA: "ประธานาธิบดี"}],
+    "ปธน.": [{ORTH: "ปธน."}],
-    "ปอ.": [{ORTH: "ปอ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศ"}],
+    "ปอ.": [{ORTH: "ปอ."}],
-    "ปอ.พ.": [{ORTH: "ปอ.พ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศพิเศษ"}],
+    "ปอ.พ.": [{ORTH: "ปอ.พ."}],
-    "พ.ก.ง.": [{ORTH: "พ.ก.ง.", LEMMA: "พัสดุเก็บเงินปลายทาง"}],
+    "พ.ก.ง.": [{ORTH: "พ.ก.ง."}],
-    "พ.ก.ส.": [{ORTH: "พ.ก.ส.", LEMMA: "พนักงานเก็บค่าโดยสาร"}],
+    "พ.ก.ส.": [{ORTH: "พ.ก.ส."}],
-    "พขร.": [{ORTH: "พขร.", LEMMA: "พนักงานขับรถ"}],
+    "พขร.": [{ORTH: "พขร."}],
-    "ภ.ง.ด.": [{ORTH: "ภ.ง.ด.", LEMMA: "ภาษีเงินได้"}],
+    "ภ.ง.ด.": [{ORTH: "ภ.ง.ด."}],
-    "ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙", LEMMA: "แบบแสดงรายการเสียภาษีเงินได้ของกรมสรรพากร"}],
+    "ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙"}],
-    "ภ.ป.ร.": [
-        {
-            ORTH: "ภ.ป.ร.",
-            LEMMA: "ภูมิพลอดุยเดช ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระปรมินทรมหาภูมิพลอดุลยเดช)",
-        }
-    ],
+    "ภ.ป.ร.": [{ORTH: "ภ.ป.ร."}],
-    "ภ.พ.": [{ORTH: "ภ.พ.", LEMMA: "ภาษีมูลค่าเพิ่ม"}],
+    "ภ.พ.": [{ORTH: "ภ.พ."}],
-    "ร.": [{ORTH: "ร.", LEMMA: "รัชกาล"}],
+    "ร.": [{ORTH: "ร."}],
-    "ร.ง.": [{ORTH: "ร.ง.", LEMMA: "โรงงาน"}],
+    "ร.ง.": [{ORTH: "ร.ง."}],
-    "ร.ด.": [{ORTH: "ร.ด.", LEMMA: "รักษาดินแดน"}],
+    "ร.ด.": [{ORTH: "ร.ด."}],
-    "รปภ.": [{ORTH: "รปภ.", LEMMA: "รักษาความปลอดภัย"}],
+    "รปภ.": [{ORTH: "รปภ."}],
-    "รพ.": [{ORTH: "รพ.", LEMMA: "โรงพยาบาล"}],
+    "รพ.": [{ORTH: "รพ."}],
-    "ร.พ.": [{ORTH: "ร.พ.", LEMMA: "โรงพิมพ์"}],
+    "ร.พ.": [{ORTH: "ร.พ."}],
-    "รร.": [{ORTH: "รร.", LEMMA: "โรงเรียน,โรงแรม"}],
+    "รร.": [{ORTH: "รร."}],
-    "รสก.": [{ORTH: "รสก.", LEMMA: "รัฐวิสาหกิจ"}],
+    "รสก.": [{ORTH: "รสก."}],
-    "ส.ค.ส.": [{ORTH: "ส.ค.ส.", LEMMA: "ส่งความสุขปีใหม่"}],
+    "ส.ค.ส.": [{ORTH: "ส.ค.ส."}],
-    "สต.": [{ORTH: "สต.", LEMMA: "สตางค์"}],
+    "สต.": [{ORTH: "สต."}],
-    "สน.": [{ORTH: "สน.", LEMMA: "สถานีตำรวจ"}],
+    "สน.": [{ORTH: "สน."}],
-    "สนข.": [{ORTH: "สนข.", LEMMA: "สำนักงานเขต"}],
+    "สนข.": [{ORTH: "สนข."}],
-    "สนง.": [{ORTH: "สนง.", LEMMA: "สำนักงาน"}],
+    "สนง.": [{ORTH: "สนง."}],
-    "สนญ.": [{ORTH: "สนญ.", LEMMA: "สำนักงานใหญ่"}],
+    "สนญ.": [{ORTH: "สนญ."}],
-    "ส.ป.ช.": [{ORTH: "ส.ป.ช.", LEMMA: "สร้างเสริมประสบการณ์ชีวิต"}],
+    "ส.ป.ช.": [{ORTH: "ส.ป.ช."}],
-    "สภ.": [{ORTH: "สภ.", LEMMA: "สถานีตำรวจภูธร"}],
+    "สภ.": [{ORTH: "สภ."}],
-    "ส.ล.น.": [{ORTH: "ส.ล.น.", LEMMA: "สร้างเสริมลักษณะนิสัย"}],
+    "ส.ล.น.": [{ORTH: "ส.ล.น."}],
-    "สวญ.": [{ORTH: "สวญ.", LEMMA: "สารวัตรใหญ่"}],
+    "สวญ.": [{ORTH: "สวญ."}],
-    "สวป.": [{ORTH: "สวป.", LEMMA: "สารวัตรป้องกันปราบปราม"}],
+    "สวป.": [{ORTH: "สวป."}],
-    "สว.สส.": [{ORTH: "สว.สส.", LEMMA: "สารวัตรสืบสวน"}],
+    "สว.สส.": [{ORTH: "สว.สส."}],
-    "ส.ห.": [{ORTH: "ส.ห.", LEMMA: "สารวัตรทหาร"}],
+    "ส.ห.": [{ORTH: "ส.ห."}],
-    "สอ.": [{ORTH: "สอ.", LEMMA: "สถานีอนามัย"}],
+    "สอ.": [{ORTH: "สอ."}],
-    "สอท.": [{ORTH: "สอท.", LEMMA: "สถานเอกอัครราชทูต"}],
+    "สอท.": [{ORTH: "สอท."}],
-    "เสธ.": [{ORTH: "เสธ.", LEMMA: "เสนาธิการ"}],
+    "เสธ.": [{ORTH: "เสธ."}],
-    "หจก.": [{ORTH: "หจก.", LEMMA: "ห้างหุ้นส่วนจำกัด"}],
+    "หจก.": [{ORTH: "หจก."}],
-    "ห.ร.ม.": [{ORTH: "ห.ร.ม.", LEMMA: "ตัวหารร่วมมาก"}],
+    "ห.ร.ม.": [{ORTH: "ห.ร.ม."}],
 }
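For context, the dict changed above is the Thai tokenizer-exception table, and after this commit each entry carries only ORTH: a special case keeps an abbreviation such as "กทม." together as a single token, while the hard-coded LEMMA values are dropped, presumably leaving lemmatization to the lemmatizer component configured elsewhere in this diff. Below is a minimal, hypothetical sketch of how an entry of this shape behaves at runtime; it is not code from this commit. Assumptions: a blank English pipeline stands in for the Thai one (which needs an external word segmenter), and the sample sentence is made up.

import spacy
from spacy.symbols import ORTH

# Register one abbreviation from the table above as a tokenizer special case,
# so the whitespace-delimited chunk "กทม." is emitted as a single token
# instead of having the trailing "." split off as punctuation.
nlp = spacy.blank("en")
nlp.tokenizer.add_special_case("กทม.", [{ORTH: "กทม."}])

doc = nlp("กทม. is the usual abbreviation for Bangkok")
print([token.text for token in doc])  # "กทม." surfaces as one token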