Merge pull request #5798 from explosion/feature/language-data-config

Ines Montani committed on 2020-07-25 13:34:49 +02:00 via GitHub
commit cdbd6ba912
139 changed files with 2914 additions and 10822 deletions

View File

@ -1,4 +1,5 @@
[training] [training]
max_steps = 0
patience = 10000 patience = 10000
eval_frequency = 200 eval_frequency = 200
dropout = 0.2 dropout = 0.2
@ -8,13 +9,20 @@ max_epochs = 100
orth_variant_level = 0.0 orth_variant_level = 0.0
gold_preproc = true gold_preproc = true
max_length = 0 max_length = 0
use_gpu = -1
scores = ["tags_acc", "uas", "las"] scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2} score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0 limit = 0
seed = 0 seed = 0
accumulate_gradient = 2 accumulate_gradient = 2
discard_oversize = false discard_oversize = false
raw_text = null
tag_map = null
morph_rules = null
base_model = null
eval_batch_size = 128
use_pytorch_for_gpu_memory = false
batch_by = "padded"
[training.batch_size] [training.batch_size]
@schedules = "compounding.v1" @schedules = "compounding.v1"
@ -30,41 +38,48 @@ beta2 = 0.999
[nlp] [nlp]
lang = "en" lang = "en"
vectors = ${training:vectors} pipeline = ["tok2vec", "tagger", "parser"]
load_vocab_data = false
[nlp.pipeline.tok2vec] [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[components]
[components.tok2vec]
factory = "tok2vec" factory = "tok2vec"
[nlp.pipeline.tagger] [components.tagger]
factory = "tagger" factory = "tagger"
[nlp.pipeline.parser] [components.parser]
factory = "parser" factory = "parser"
learn_tokens = false learn_tokens = false
min_action_freq = 1 min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.tagger.model] [components.tagger.model]
@architectures = "spacy.Tagger.v1" @architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec] [components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1" @architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width} width = ${components.tok2vec.model:width}
[nlp.pipeline.parser.model] [components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1" @architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8 nr_feature_tokens = 8
hidden_width = 64 hidden_width = 64
maxout_pieces = 3 maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec] [components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1" @architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width} width = ${components.tok2vec.model:width}
[nlp.pipeline.tok2vec.model] [components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors} pretrained_vectors = ${training:vectors}
width = 96 width = 96
depth = 4 depth = 4
window_size = 1 window_size = 1
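
A minimal sketch (not part of the commit) of how the renamed layout parses: pipeline components now live under [components.*] instead of [nlp.pipeline.*], and interpolation references move with them. Parsing with thinc's Config is assumed to resolve the ${...:width} reference; the architecture names are only parsed here, not resolved against the registry.

from thinc.api import Config

config_str = """
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
width = 96
depth = 4
window_size = 1

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${components.tok2vec.model:width}
"""

config = Config().from_str(config_str)
# With interpolation applied, the tagger's tok2vec width should mirror the
# shared tok2vec component's width (96 here).
print(config["components"]["tagger"]["model"]["tok2vec"]["width"])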

View File

@ -104,7 +104,6 @@ exclude =
.git, .git,
__pycache__, __pycache__,
_tokenizer_exceptions_list.py, _tokenizer_exceptions_list.py,
spacy/__init__.py
[tool:pytest] [tool:pytest]
markers = markers =

View File

@@ -1,32 +1,50 @@
+from typing import Union, Iterable, Dict, Any
+from pathlib import Path
 import warnings
 import sys

-warnings.filterwarnings("ignore", message="numpy.dtype size changed")
-warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
+warnings.filterwarnings("ignore", message="numpy.dtype size changed")  # noqa
+warnings.filterwarnings("ignore", message="numpy.ufunc size changed")  # noqa

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu
+from thinc.api import prefer_gpu, require_gpu  # noqa: F401

-from . import pipeline
-from .cli.info import info
-from .glossary import explain
-from .about import __version__
-from .errors import Errors, Warnings
+from . import pipeline  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .glossary import explain  # noqa: F401
+from .about import __version__  # noqa: F401
+from .util import registry  # noqa: F401
+from .errors import Errors
+from .language import Language
 from . import util
-from .util import registry

 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)

-config = registry

-def load(name, **overrides):
-    return util.load_model(name, **overrides)
+def load(
+    name: Union[str, Path],
+    disable: Iterable[str] = tuple(),
+    component_cfg: Dict[str, Dict[str, Any]] = util.SimpleFrozenDict(),
+) -> Language:
+    """Load a spaCy model from an installed package or a local path.
+
+    name (str): Package name or model path.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
+    """
+    return util.load_model(name, disable=disable, component_cfg=component_cfg)


-def blank(name, **kwargs):
+def blank(name: str, **overrides) -> Language:
+    """Create a blank nlp object for a given language code.
+
+    name (str): The language code, e.g. "en".
+    **overrides: Keyword arguments passed to language subclass on init.
+    RETURNS (Language): The nlp object.
+    """
     LangClass = util.get_lang_class(name)
-    return LangClass(**kwargs)
+    return LangClass(**overrides)
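
A usage sketch of the new top-level helpers defined above. The model name is only an example and assumes the package is installed.

import spacy

# Create a blank pipeline from a language code; extra keyword arguments are
# passed through to the Language subclass.
nlp = spacy.blank("en")

# Load an installed package or a local path; components named in `disable`
# are skipped. "en_core_web_sm" is an example name, not shipped with spaCy.
nlp = spacy.load("en_core_web_sm", disable=["parser"])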

View File

@ -41,7 +41,6 @@ def init_model_cli(
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)") base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
# fmt: on # fmt: on
): ):
@ -60,7 +59,6 @@ def init_model_cli(
truncate_vectors=truncate_vectors, truncate_vectors=truncate_vectors,
vectors_name=vectors_name, vectors_name=vectors_name,
model_name=model_name, model_name=model_name,
omit_extra_lookups=omit_extra_lookups,
base_model=base_model, base_model=base_model,
silent=False, silent=False,
) )
@ -77,7 +75,6 @@ def init_model(
truncate_vectors: int = 0, truncate_vectors: int = 0,
vectors_name: Optional[str] = None, vectors_name: Optional[str] = None,
model_name: Optional[str] = None, model_name: Optional[str] = None,
omit_extra_lookups: bool = False,
base_model: Optional[str] = None, base_model: Optional[str] = None,
silent: bool = True, silent: bool = True,
) -> Language: ) -> Language:
@ -109,14 +106,6 @@ def init_model(
with msg.loading("Creating model..."): with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed
if omit_extra_lookups:
nlp.vocab.lookups_extra = Lookups()
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
nlp.vocab.lookups_extra.add_table("lexeme_prob")
nlp.vocab.lookups_extra.add_table("lexeme_settings")
msg.good("Successfully created model") msg.good("Successfully created model")
if vectors_loc is not None: if vectors_loc is not None:
add_vectors( add_vectors(

View File

@ -120,14 +120,6 @@ def train(
# Load morph rules # Load morph rules
nlp.vocab.morphology.load_morph_exceptions(morph_rules) nlp.vocab.morphology.load_morph_exceptions(morph_rules)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed
if config["training"]["omit_extra_lookups"]:
nlp.vocab.lookups_extra = Lookups()
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
nlp.vocab.lookups_extra.add_table("lexeme_prob")
nlp.vocab.lookups_extra.add_table("lexeme_settings")
# Load a pretrained tok2vec model - cf. CLI command 'pretrain' # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
if weights_data is not None: if weights_data is not None:
tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None) tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)

View File

@ -1,8 +1,7 @@
[nlp] [nlp]
lang = null lang = null
stop_words = []
lex_attr_getters = {}
pipeline = [] pipeline = []
load_vocab_data = true
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1" @tokenizers = "spacy.Tokenizer.v1"
@ -10,11 +9,6 @@ pipeline = []
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1" @lemmatizers = "spacy.Lemmatizer.v1"
[nlp.writing_system]
direction = "ltr"
has_case = true
has_letters = true
[components] [components]
# Training hyper-parameters and additional features. # Training hyper-parameters and additional features.
@ -45,7 +39,6 @@ score_weights = {"tag_acc": 0.2, "dep_las": 0.4, "ents_f": 0.4}
# These settings are invalid for the transformer models. # These settings are invalid for the transformer models.
init_tok2vec = null init_tok2vec = null
discard_oversize = false discard_oversize = false
omit_extra_lookups = false
batch_by = "sequences" batch_by = "sequences"
raw_text = null raw_text = null
tag_map = null tag_map = null
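
A quick way to see where the removed settings went (a sketch; it assumes the Language.config property of this dev version and uses English as an arbitrary example language).

import spacy

nlp = spacy.blank("en")
# The [nlp] block of the resulting config now only carries structural
# settings (lang, pipeline, load_vocab_data, tokenizer/lemmatizer blocks)...
print(nlp.config["nlp"])
# ...while language data such as stop words lives on the Defaults.
print(len(nlp.Defaults.stop_words) > 0)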

View File

@ -83,7 +83,7 @@ class Warnings:
"doesn't have a normalization table, please ignore this warning. " "doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data " "If this is surprising, make sure you have the spacy-lookups-data "
"package installed. The languages with lexeme normalization tables " "package installed. The languages with lexeme normalization tables "
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") "are currently: {langs}")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
@ -434,9 +434,6 @@ class Errors:
E170 = ("Cannot apply transition {name}: invalid for the current state.") E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid on_match callback argument: expected " E171 = ("Matcher.add received invalid on_match callback argument: expected "
"callable or None, but got: {arg_type}") "callable or None, but got: {arg_type}")
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
"Lemmatizer, initialize the class directly. See the docs for "
"details: https://spacy.io/api/lemmatizer")
E175 = ("Can't remove rule for unknown match pattern ID: {key}") E175 = ("Can't remove rule for unknown match pattern ID: {key}")
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
E177 = ("Ill-formed IOB input detected: {tag}") E177 = ("Ill-formed IOB input detected: {tag}")
@ -486,6 +483,7 @@ class Errors:
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.")
E956 = ("Can't find component '{name}' in [components] block in the config. " E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}") "Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in " E957 = ("Writing directly to Language.factories isn't needed anymore in "
@ -601,7 +599,7 @@ class Errors:
"the same `Vocab`.") "the same `Vocab`.")
E1000 = ("No pkuseg model available. Provide a pkuseg model when " E1000 = ("No pkuseg model available. Provide a pkuseg model when "
"initializing the pipeline:\n" "initializing the pipeline:\n"
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\m' 'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
'nlp = Chinese(config=cfg)') 'nlp = Chinese(config=cfg)')

View File

@ -25,8 +25,9 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
lower = True lower = True
if raw is not None: if raw is not None:
raw = raw.lower() raw = raw.lower()
ndsv = nlp.Defaults.single_orth_variants orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
ndpv = nlp.Defaults.paired_orth_variants ndsv = orth_variants.get("single", [])
ndpv = orth_variants.get("paired", [])
words = token_dict.get("words", []) words = token_dict.get("words", [])
tags = token_dict.get("tags", []) tags = token_dict.get("tags", [])
# keep unmodified if words or tags are not defined # keep unmodified if words or tags are not defined
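
A small sketch of the new lookup path used above. The table name "orth_variants" and its "single"/"paired" keys come from this hunk; English is just an arbitrary example language.

from spacy.lang.en import English

nlp = English()
# The augmenter reads from the vocab's lookups rather than from
# Language.Defaults; if no table has been set up, the default {} is used and
# the text is left unmodified.
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
single = orth_variants.get("single", [])
paired = orth_variants.get("paired", [])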

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class AfrikaansDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "af"
stop_words = {"@language_data": "spacy.af.stop_words"}
"""
@registry.language_data("spacy.af.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Afrikaans(Language): class Afrikaans(Language):
lang = "af" lang = "af"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = AfrikaansDefaults
__all__ = ["Afrikaans"] __all__ = ["Afrikaans"]

View File

@ -1,48 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "ar"
stop_words = {"@language_data": "spacy.ar.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.ar.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ar.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class ArabicDefaults(Language.Defaults): class ArabicDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Arabic(Language): class Arabic(Language):
lang = "ar"
Defaults = ArabicDefaults Defaults = ArabicDefaults
default_config = Config().from_str(DEFAULT_CONFIG) lang = "ar"
__all__ = ["Arabic"] __all__ = ["Arabic"]

View File

@ -1,4 +1,6 @@
from ...symbols import ORTH, LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -6,41 +8,41 @@ _exc = {}
# Time # Time
for exc_data in [ for exc_data in [
{LEMMA: "قبل الميلاد", ORTH: "ق.م"}, {NORM: "قبل الميلاد", ORTH: "ق.م"},
{LEMMA: "بعد الميلاد", ORTH: "ب. م"}, {NORM: "بعد الميلاد", ORTH: "ب. م"},
{LEMMA: "ميلادي", ORTH: ""}, {NORM: "ميلادي", ORTH: ""},
{LEMMA: "هجري", ORTH: ".هـ"}, {NORM: "هجري", ORTH: ".هـ"},
{LEMMA: "توفي", ORTH: ""}, {NORM: "توفي", ORTH: ""},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
# Scientific abv. # Scientific abv.
for exc_data in [ for exc_data in [
{LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"}, {NORM: "صلى الله عليه وسلم", ORTH: "صلعم"},
{LEMMA: "الشارح", ORTH: "الشـ"}, {NORM: "الشارح", ORTH: "الشـ"},
{LEMMA: "الظاهر", ORTH: "الظـ"}, {NORM: "الظاهر", ORTH: "الظـ"},
{LEMMA: "أيضًا", ORTH: "أيضـ"}, {NORM: "أيضًا", ORTH: "أيضـ"},
{LEMMA: "إلى آخره", ORTH: "إلخ"}, {NORM: "إلى آخره", ORTH: "إلخ"},
{LEMMA: "انتهى", ORTH: "اهـ"}, {NORM: "انتهى", ORTH: "اهـ"},
{LEMMA: "حدّثنا", ORTH: "ثنا"}, {NORM: "حدّثنا", ORTH: "ثنا"},
{LEMMA: "حدثني", ORTH: "ثنى"}, {NORM: "حدثني", ORTH: "ثنى"},
{LEMMA: "أنبأنا", ORTH: "أنا"}, {NORM: "أنبأنا", ORTH: "أنا"},
{LEMMA: "أخبرنا", ORTH: "نا"}, {NORM: "أخبرنا", ORTH: "نا"},
{LEMMA: "مصدر سابق", ORTH: "م. س"}, {NORM: "مصدر سابق", ORTH: "م. س"},
{LEMMA: "مصدر نفسه", ORTH: "م. ن"}, {NORM: "مصدر نفسه", ORTH: "م. ن"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
# Other abv. # Other abv.
for exc_data in [ for exc_data in [
{LEMMA: "دكتور", ORTH: "د."}, {NORM: "دكتور", ORTH: "د."},
{LEMMA: "أستاذ دكتور", ORTH: "أ.د"}, {NORM: "أستاذ دكتور", ORTH: "أ.د"},
{LEMMA: "أستاذ", ORTH: "أ."}, {NORM: "أستاذ", ORTH: "أ."},
{LEMMA: "بروفيسور", ORTH: "ب."}, {NORM: "بروفيسور", ORTH: "ب."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
for exc_data in [{LEMMA: "تلفون", ORTH: "ت."}, {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]: for exc_data in [{NORM: "تلفون", ORTH: "ت."}, {NORM: "صندوق بريد", ORTH: "ص.ب"}]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
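
The same pattern for a hypothetical custom exception, mirroring the change above: entries set NORM instead of LEMMA, and the merge with the shared BASE_EXCEPTIONS now happens in the exceptions module itself.

from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS
from spacy.symbols import ORTH, NORM
from spacy.util import update_exc

# Hypothetical exception: the exact ORTH string maps to a list of token
# dicts, which may set NORM but no longer LEMMA or TAG.
_exc = {"approx.": [{ORTH: "approx.", NORM: "approximately"}]}

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)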

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class BulgarianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "bg"
stop_words = {"@language_data": "spacy.bg.stop_words"}
"""
@registry.language_data("spacy.bg.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Bulgarian(Language): class Bulgarian(Language):
lang = "bg" lang = "bg"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = BulgarianDefaults
__all__ = ["Bulgarian"] __all__ = ["Bulgarian"]

View File

@ -1,44 +1,20 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "bn"
stop_words = {"@language_data": "spacy.bn.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.bn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class BengaliDefaults(Language.Defaults): class BengaliDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Bengali(Language): class Bengali(Language):
lang = "bn" lang = "bn"
Defaults = BengaliDefaults Defaults = BengaliDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Bengali"] __all__ = ["Bengali"]

View File

@ -1,24 +1,26 @@
from ...symbols import ORTH, LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "ডঃ", LEMMA: "ডক্টর"}, {ORTH: "ডঃ", NORM: "ডক্টর"},
{ORTH: "ডাঃ", LEMMA: "ডাক্তার"}, {ORTH: "ডাঃ", NORM: "ডাক্তার"},
{ORTH: "ড.", LEMMA: "ডক্টর"}, {ORTH: "ড.", NORM: "ডক্টর"},
{ORTH: "ডা.", LEMMA: "ডাক্তার"}, {ORTH: "ডা.", NORM: "ডাক্তার"},
{ORTH: "মোঃ", LEMMA: "মোহাম্মদ"}, {ORTH: "মোঃ", NORM: "মোহাম্মদ"},
{ORTH: "মো.", LEMMA: "মোহাম্মদ"}, {ORTH: "মো.", NORM: "মোহাম্মদ"},
{ORTH: "সে.", LEMMA: "সেলসিয়াস"}, {ORTH: "সে.", NORM: "সেলসিয়াস"},
{ORTH: "কি.মি.", LEMMA: "কিলোমিটার"}, {ORTH: "কি.মি.", NORM: "কিলোমিটার"},
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, {ORTH: "কি.মি", NORM: "কিলোমিটার"},
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি.", NORM: "সেন্টিমিটার"},
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি", NORM: "সেন্টিমিটার"},
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}, {ORTH: "মি.লি.", NORM: "মিলিলিটার"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,49 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
from .punctuation import TOKENIZER_INFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "ca"
stop_words = {"@language_data": "spacy.ca.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.ca.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ca.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class CatalanDefaults(Language.Defaults): class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Catalan(Language): class Catalan(Language):
lang = "ca" lang = "ca"
Defaults = CatalanDefaults Defaults = CatalanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Catalan"] __all__ = ["Catalan"]

View File

@ -1,38 +1,40 @@
from ...symbols import ORTH, LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "aprox.", LEMMA: "aproximadament"}, {ORTH: "aprox.", NORM: "aproximadament"},
{ORTH: "pàg.", LEMMA: "pàgina"}, {ORTH: "pàg.", NORM: "pàgina"},
{ORTH: "p.ex.", LEMMA: "per exemple"}, {ORTH: "p.ex.", NORM: "per exemple"},
{ORTH: "gen.", LEMMA: "gener"}, {ORTH: "gen.", NORM: "gener"},
{ORTH: "feb.", LEMMA: "febrer"}, {ORTH: "feb.", NORM: "febrer"},
{ORTH: "abr.", LEMMA: "abril"}, {ORTH: "abr.", NORM: "abril"},
{ORTH: "jul.", LEMMA: "juliol"}, {ORTH: "jul.", NORM: "juliol"},
{ORTH: "set.", LEMMA: "setembre"}, {ORTH: "set.", NORM: "setembre"},
{ORTH: "oct.", LEMMA: "octubre"}, {ORTH: "oct.", NORM: "octubre"},
{ORTH: "nov.", LEMMA: "novembre"}, {ORTH: "nov.", NORM: "novembre"},
{ORTH: "dec.", LEMMA: "desembre"}, {ORTH: "dec.", NORM: "desembre"},
{ORTH: "Dr.", LEMMA: "doctor"}, {ORTH: "Dr.", NORM: "doctor"},
{ORTH: "Sr.", LEMMA: "senyor"}, {ORTH: "Sr.", NORM: "senyor"},
{ORTH: "Sra.", LEMMA: "senyora"}, {ORTH: "Sra.", NORM: "senyora"},
{ORTH: "Srta.", LEMMA: "senyoreta"}, {ORTH: "Srta.", NORM: "senyoreta"},
{ORTH: "núm", LEMMA: "número"}, {ORTH: "núm", NORM: "número"},
{ORTH: "St.", LEMMA: "sant"}, {ORTH: "St.", NORM: "sant"},
{ORTH: "Sta.", LEMMA: "santa"}, {ORTH: "Sta.", NORM: "santa"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
# Times # Times
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]
for h in range(1, 12 + 1): for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "a.m."}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, NORM: "p.m."}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class CzechDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "cs"
stop_words = {"@language_data": "spacy.cs.stop_words"}
"""
@registry.language_data("spacy.cs.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Czech(Language): class Czech(Language):
lang = "cs" lang = "cs"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = CzechDefaults
__all__ = ["Czech"] __all__ = ["Czech"]

View File

@ -1,50 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "da"
stop_words = {"@language_data": "spacy.da.stop_words"}
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.da.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.da.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class DanishDefaults(Language.Defaults): class DanishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Danish(Language): class Danish(Language):
lang = "da" lang = "da"
Defaults = DanishDefaults Defaults = DanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Danish"] __all__ = ["Danish"]

View File

@ -2,7 +2,9 @@
Tokenizer Exceptions. Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others. Source: https://forkortelse.dk/ and various others.
""" """
from ...symbols import ORTH, LEMMA, NORM from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -11,44 +13,44 @@ _exc = {}
# (for "torsdag") are left out because they are ambiguous. The same is the case # (for "torsdag") are left out because they are ambiguous. The same is the case
# for abbreviations "jul." and "Jul." ("juli"). # for abbreviations "jul." and "Jul." ("juli").
for exc_data in [ for exc_data in [
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, {ORTH: "Kbh.", NORM: "København"},
{ORTH: "jan.", LEMMA: "januar"}, {ORTH: "jan.", NORM: "januar"},
{ORTH: "febr.", LEMMA: "februar"}, {ORTH: "febr.", NORM: "februar"},
{ORTH: "feb.", LEMMA: "februar"}, {ORTH: "feb.", NORM: "februar"},
{ORTH: "mar.", LEMMA: "marts"}, {ORTH: "mar.", NORM: "marts"},
{ORTH: "apr.", LEMMA: "april"}, {ORTH: "apr.", NORM: "april"},
{ORTH: "jun.", LEMMA: "juni"}, {ORTH: "jun.", NORM: "juni"},
{ORTH: "aug.", LEMMA: "august"}, {ORTH: "aug.", NORM: "august"},
{ORTH: "sept.", LEMMA: "september"}, {ORTH: "sept.", NORM: "september"},
{ORTH: "sep.", LEMMA: "september"}, {ORTH: "sep.", NORM: "september"},
{ORTH: "okt.", LEMMA: "oktober"}, {ORTH: "okt.", NORM: "oktober"},
{ORTH: "nov.", LEMMA: "november"}, {ORTH: "nov.", NORM: "november"},
{ORTH: "dec.", LEMMA: "december"}, {ORTH: "dec.", NORM: "december"},
{ORTH: "man.", LEMMA: "mandag"}, {ORTH: "man.", NORM: "mandag"},
{ORTH: "tirs.", LEMMA: "tirsdag"}, {ORTH: "tirs.", NORM: "tirsdag"},
{ORTH: "ons.", LEMMA: "onsdag"}, {ORTH: "ons.", NORM: "onsdag"},
{ORTH: "tor.", LEMMA: "torsdag"}, {ORTH: "tor.", NORM: "torsdag"},
{ORTH: "tors.", LEMMA: "torsdag"}, {ORTH: "tors.", NORM: "torsdag"},
{ORTH: "fre.", LEMMA: "fredag"}, {ORTH: "fre.", NORM: "fredag"},
{ORTH: "lør.", LEMMA: "lørdag"}, {ORTH: "lør.", NORM: "lørdag"},
{ORTH: "Jan.", LEMMA: "januar"}, {ORTH: "Jan.", NORM: "januar"},
{ORTH: "Febr.", LEMMA: "februar"}, {ORTH: "Febr.", NORM: "februar"},
{ORTH: "Feb.", LEMMA: "februar"}, {ORTH: "Feb.", NORM: "februar"},
{ORTH: "Mar.", LEMMA: "marts"}, {ORTH: "Mar.", NORM: "marts"},
{ORTH: "Apr.", LEMMA: "april"}, {ORTH: "Apr.", NORM: "april"},
{ORTH: "Jun.", LEMMA: "juni"}, {ORTH: "Jun.", NORM: "juni"},
{ORTH: "Aug.", LEMMA: "august"}, {ORTH: "Aug.", NORM: "august"},
{ORTH: "Sept.", LEMMA: "september"}, {ORTH: "Sept.", NORM: "september"},
{ORTH: "Sep.", LEMMA: "september"}, {ORTH: "Sep.", NORM: "september"},
{ORTH: "Okt.", LEMMA: "oktober"}, {ORTH: "Okt.", NORM: "oktober"},
{ORTH: "Nov.", LEMMA: "november"}, {ORTH: "Nov.", NORM: "november"},
{ORTH: "Dec.", LEMMA: "december"}, {ORTH: "Dec.", NORM: "december"},
{ORTH: "Man.", LEMMA: "mandag"}, {ORTH: "Man.", NORM: "mandag"},
{ORTH: "Tirs.", LEMMA: "tirsdag"}, {ORTH: "Tirs.", NORM: "tirsdag"},
{ORTH: "Ons.", LEMMA: "onsdag"}, {ORTH: "Ons.", NORM: "onsdag"},
{ORTH: "Fre.", LEMMA: "fredag"}, {ORTH: "Fre.", NORM: "fredag"},
{ORTH: "Lør.", LEMMA: "lørdag"}, {ORTH: "Lør.", NORM: "lørdag"},
{ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller"}, {ORTH: "og/eller", NORM: "og/eller"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -548,22 +550,22 @@ for orth in [
_exc[capitalized] = [{ORTH: capitalized}] _exc[capitalized] = [{ORTH: capitalized}]
for exc_data in [ for exc_data in [
{ORTH: "s'gu", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "s'gu", NORM: "s'gu"},
{ORTH: "S'gu", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "S'gu", NORM: "s'gu"},
{ORTH: "sgu'", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "sgu'", NORM: "s'gu"},
{ORTH: "Sgu'", LEMMA: "s'gu", NORM: "s'gu"}, {ORTH: "Sgu'", NORM: "s'gu"},
{ORTH: "sku'", LEMMA: "skal", NORM: "skulle"}, {ORTH: "sku'", NORM: "skulle"},
{ORTH: "ku'", LEMMA: "kan", NORM: "kunne"}, {ORTH: "ku'", NORM: "kunne"},
{ORTH: "Ku'", LEMMA: "kan", NORM: "kunne"}, {ORTH: "Ku'", NORM: "kunne"},
{ORTH: "ka'", LEMMA: "kan", NORM: "kan"}, {ORTH: "ka'", NORM: "kan"},
{ORTH: "Ka'", LEMMA: "kan", NORM: "kan"}, {ORTH: "Ka'", NORM: "kan"},
{ORTH: "gi'", LEMMA: "give", NORM: "giv"}, {ORTH: "gi'", NORM: "giv"},
{ORTH: "Gi'", LEMMA: "give", NORM: "giv"}, {ORTH: "Gi'", NORM: "giv"},
{ORTH: "li'", LEMMA: "lide", NORM: "lide"}, {ORTH: "li'", NORM: "lide"},
{ORTH: "ha'", LEMMA: "have", NORM: "have"}, {ORTH: "ha'", NORM: "have"},
{ORTH: "Ha'", LEMMA: "have", NORM: "have"}, {ORTH: "Ha'", NORM: "have"},
{ORTH: "ik'", LEMMA: "ikke", NORM: "ikke"}, {ORTH: "ik'", NORM: "ikke"},
{ORTH: "Ik'", LEMMA: "ikke", NORM: "ikke"}, {ORTH: "Ik'", NORM: "ikke"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -573,7 +575,7 @@ for h in range(1, 31 + 1):
for period in ["."]: for period in ["."]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
_custom_base_exc = {"i.": [{ORTH: "i", LEMMA: "i", NORM: "i"}, {ORTH: "."}]} _custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc) _exc.update(_custom_base_exc)
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,61 +1,22 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.de.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class GermanDefaults(Language.Defaults): class GermanDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
single_orth_variants = [ stop_words = STOP_WORDS
{"tags": ["$("], "variants": ["", "..."]},
{"tags": ["$("], "variants": ["-", "", "", "--", "---", "——"]},
]
paired_orth_variants = [
{
"tags": ["$("],
"variants": [("'", "'"), (",", "'"), ("", ""), ("", ""), ("", "")],
},
{
"tags": ["$("],
"variants": [("``", "''"), ('"', '"'), ("", ""), ("»", "«"), ("«", "»")],
},
]
class German(Language): class German(Language):
lang = "de" lang = "de"
Defaults = GermanDefaults Defaults = GermanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["German"] __all__ = ["German"]

View File

@ -1,39 +1,26 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
# this iterator extracts spans headed by NOUNs starting from the left-most # this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself for close apposition and # syntactic dependent until the NOUN itself for close apposition and
# measurement construction, the span is sometimes extended to the right of # measurement construction, the span is sometimes extended to the right of
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" # the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
# and not just "eine Tasse", same for "das Thema Familie". # and not just "eine Tasse", same for "das Thema Familie".
labels = [ # fmt: off
"sb", labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
"oa", # fmt: on
"da",
"nk",
"mo",
"ag",
"ROOT",
"root",
"cj",
"pd",
"og",
"app",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
np_deps = set(doc.vocab.strings.add(label) for label in labels) np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add("nk") close_app = doc.vocab.strings.add("nk")
rbracket = 0 rbracket = 0
for i, word in enumerate(doclike): for i, word in enumerate(doclike):
if i < rbracket: if i < rbracket:

View File

@ -1,157 +1,135 @@
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = { _exc = {
"auf'm": [{ORTH: "auf", LEMMA: "auf"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}], "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
"du's": [ "du's": [{ORTH: "du"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, "er's": [{ORTH: "er"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, "hinter'm": [{ORTH: "hinter"}, {ORTH: "'m", NORM: "dem"}],
], "ich's": [{ORTH: "ich"}, {ORTH: "'s", NORM: "es"}],
"er's": [ "ihr's": [{ORTH: "ihr"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "er", LEMMA: PRON_LEMMA, TAG: "PPER"}, "sie's": [{ORTH: "sie"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"}, "unter'm": [{ORTH: "unter"}, {ORTH: "'m", NORM: "dem"}],
], "vor'm": [{ORTH: "vor"}, {ORTH: "'m", NORM: "dem"}],
"hinter'm": [ "wir's": [{ORTH: "wir"}, {ORTH: "'s", NORM: "es"}],
{ORTH: "hinter", LEMMA: "hinter"}, "über'm": [{ORTH: "über"}, {ORTH: "'m", NORM: "dem"}],
{ORTH: "'m", LEMMA: "der", NORM: "dem"},
],
"ich's": [
{ORTH: "ich", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"ihr's": [
{ORTH: "ihr", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"sie's": [
{ORTH: "sie", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"unter'm": [
{ORTH: "unter", LEMMA: "unter"},
{ORTH: "'m", LEMMA: "der", NORM: "dem"},
],
"vor'm": [{ORTH: "vor", LEMMA: "vor"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"wir's": [
{ORTH: "wir", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER", NORM: "es"},
],
"über'm": [{ORTH: "über", LEMMA: "über"}, {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
} }
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "'S", NORM: "'s"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "'s", NORM: "'s"},
{ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "S'", NORM: "'s"},
{ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"}, {ORTH: "s'", NORM: "'s"},
{ORTH: "'n", LEMMA: "ein", NORM: "ein"}, {ORTH: "'n", NORM: "ein"},
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, {ORTH: "'ne", NORM: "eine"},
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, {ORTH: "'nen", NORM: "einen"},
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, {ORTH: "'nem", NORM: "einem"},
{ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"}, {ORTH: "Abb.", NORM: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"}, {ORTH: "Abk.", NORM: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"}, {ORTH: "Abt.", NORM: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"}, {ORTH: "Aug.", NORM: "August"},
{ORTH: "Bd.", LEMMA: "Band", NORM: "Band"}, {ORTH: "Bd.", NORM: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"}, {ORTH: "Betr.", NORM: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, {ORTH: "Bf.", NORM: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"}, {ORTH: "Bhf.", NORM: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"}, {ORTH: "Bsp.", NORM: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"}, {ORTH: "Dez.", NORM: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"}, {ORTH: "Di.", NORM: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"}, {ORTH: "Do.", NORM: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"}, {ORTH: "Fa.", NORM: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"}, {ORTH: "Fam.", NORM: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"}, {ORTH: "Feb.", NORM: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"}, {ORTH: "Fr.", NORM: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"}, {ORTH: "Frl.", NORM: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"}, {ORTH: "Hbf.", NORM: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"}, {ORTH: "Hr.", NORM: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"}, {ORTH: "Hrn.", NORM: "Herrn"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, {ORTH: "Jan.", NORM: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, {ORTH: "Jh.", NORM: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"}, {ORTH: "Jhd.", NORM: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, {ORTH: "Jul.", NORM: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, {ORTH: "Jun.", NORM: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"}, {ORTH: "Mi.", NORM: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million", NORM: "Million"}, {ORTH: "Mio.", NORM: "Million"},
{ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"}, {ORTH: "Mo.", NORM: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"}, {ORTH: "Mrd.", NORM: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März", NORM: "März"}, {ORTH: "Mrz.", NORM: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"}, {ORTH: "MwSt.", NORM: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März", NORM: "März"}, {ORTH: "Mär.", NORM: "März"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"}, {ORTH: "Nr.", NORM: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, {ORTH: "Okt.", NORM: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original", NORM: "Original"}, {ORTH: "Orig.", NORM: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"}, {ORTH: "Pkt.", NORM: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"}, {ORTH: "Prof.", NORM: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"}, {ORTH: "Red.", NORM: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"}, {ORTH: "Sa.", NORM: "Samstag"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"}, {ORTH: "Sept.", NORM: "September"},
{ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"}, {ORTH: "So.", NORM: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"}, {ORTH: "Std.", NORM: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"}, {ORTH: "Str.", NORM: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, {ORTH: "Tel.", NORM: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"}, {ORTH: "Tsd.", NORM: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"}, {ORTH: "Univ.", NORM: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"}, {ORTH: "abzgl.", NORM: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"}, {ORTH: "allg.", NORM: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"}, {ORTH: "bspw.", NORM: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"}, {ORTH: "bzgl.", NORM: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"}, {ORTH: "bzw.", NORM: "beziehungsweise"},
{ORTH: "d.h.", LEMMA: "das heißt"}, {ORTH: "d.h."},
{ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"}, {ORTH: "dgl.", NORM: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"}, {ORTH: "ebd.", NORM: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"}, {ORTH: "eigtl.", NORM: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"}, {ORTH: "engl.", NORM: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"}, {ORTH: "evtl.", NORM: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"}, {ORTH: "frz.", NORM: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"}, {ORTH: "gegr.", NORM: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, {ORTH: "ggf.", NORM: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"}, {ORTH: "ggfs.", NORM: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"}, {ORTH: "ggü.", NORM: "gegenüber"},
{ORTH: "i.O.", LEMMA: "in Ordnung"}, {ORTH: "i.O."},
{ORTH: "i.d.R.", LEMMA: "in der Regel"}, {ORTH: "i.d.R."},
{ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"}, {ORTH: "incl.", NORM: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"}, {ORTH: "inkl.", NORM: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"}, {ORTH: "insb.", NORM: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"}, {ORTH: "kath.", NORM: "katholisch"},
{ORTH: "lt.", LEMMA: "laut", NORM: "laut"}, {ORTH: "lt.", NORM: "laut"},
{ORTH: "max.", LEMMA: "maximal", NORM: "maximal"}, {ORTH: "max.", NORM: "maximal"},
{ORTH: "min.", LEMMA: "minimal", NORM: "minimal"}, {ORTH: "min.", NORM: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"}, {ORTH: "mind.", NORM: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"}, {ORTH: "mtl.", NORM: "monatlich"},
{ORTH: "n.Chr.", LEMMA: "nach Christus"}, {ORTH: "n.Chr."},
{ORTH: "orig.", LEMMA: "original", NORM: "original"}, {ORTH: "orig.", NORM: "original"},
{ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"}, {ORTH: "röm.", NORM: "römisch"},
{ORTH: "s.o.", LEMMA: "siehe oben"}, {ORTH: "s.o."},
{ORTH: "sog.", LEMMA: "so genannt"}, {ORTH: "sog."},
{ORTH: "stellv.", LEMMA: "stellvertretend"}, {ORTH: "stellv."},
{ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"}, {ORTH: "tägl.", NORM: "täglich"},
{ORTH: "u.U.", LEMMA: "unter Umständen"}, {ORTH: "u.U."},
{ORTH: "u.s.w.", LEMMA: "und so weiter"}, {ORTH: "u.s.w."},
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, {ORTH: "u.v.m."},
{ORTH: "usf.", LEMMA: "und so fort"}, {ORTH: "usf."},
{ORTH: "usw.", LEMMA: "und so weiter"}, {ORTH: "usw."},
{ORTH: "uvm.", LEMMA: "und vieles mehr"}, {ORTH: "uvm."},
{ORTH: "v.Chr.", LEMMA: "vor Christus"}, {ORTH: "v.Chr."},
{ORTH: "v.a.", LEMMA: "vor allem"}, {ORTH: "v.a."},
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, {ORTH: "v.l.n.r."},
{ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"}, {ORTH: "vgl.", NORM: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"}, {ORTH: "vllt.", NORM: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"}, {ORTH: "vlt.", NORM: "vielleicht"},
{ORTH: "z.B.", LEMMA: "zum Beispiel"}, {ORTH: "z.B."},
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, {ORTH: "z.Bsp."},
{ORTH: "z.T.", LEMMA: "zum Teil"}, {ORTH: "z.T."},
{ORTH: "z.Z.", LEMMA: "zur Zeit"}, {ORTH: "z.Z."},
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}, {ORTH: "z.Zt."},
{ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "z.b."},
{ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "zzgl."},
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}, {ORTH: "österr.", NORM: "österreichisch"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -254,4 +232,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -7,53 +7,44 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "el"
stop_words = {"@language_data": "spacy.el.stop_words"}
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.GreekLemmatizer.v1" @lemmatizers = "spacy.el.GreekLemmatizer"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
""" """
@registry.lemmatizers("spacy.GreekLemmatizer.v1") @registry.lemmatizers("spacy.el.GreekLemmatizer")
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer: def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]:
return GreekLemmatizer(data_paths=data_paths) tables = ["lemma_index", "lemma_exc", "lemma_rules"]
def lemmatizer_factory(nlp: Language) -> GreekLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return GreekLemmatizer(lookups=lookups)
@registry.language_data("spacy.el.stop_words") return lemmatizer_factory
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.el.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class GreekDefaults(Language.Defaults): class GreekDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
class Greek(Language): class Greek(Language):
lang = "el" lang = "el"
Defaults = GreekDefaults Defaults = GreekDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Greek"] __all__ = ["Greek"]

View File

@ -1,21 +1,20 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases. Works on both Doc and Span.
"""
# It follows the logic of the noun chunks finder of English language, # It follows the logic of the noun chunks finder of English language,
# adjusted to some Greek language special characteristics. # adjusted to some Greek language special characteristics.
# obj tag corrects some DEP tagger mistakes. # obj tag corrects some DEP tagger mistakes.
# Further improvement of the models will eliminate the need for this tag. # Further improvement of the models will eliminate the need for this tag.
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings.add(label) for label in labels] np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
nmod = doc.vocab.strings.add("nmod") nmod = doc.vocab.strings.add("nmod")

File diff suppressed because it is too large.

View File

@ -1,129 +1,128 @@
from ...symbols import ORTH, LEMMA, NORM from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]: for token in ["Απ'", "ΑΠ'", "αφ'", "Αφ'"]:
_exc[token] = [{ORTH: token, LEMMA: "από", NORM: "από"}] _exc[token] = [{ORTH: token, NORM: "από"}]
for token in ["Αλλ'", "αλλ'"]: for token in ["Αλλ'", "αλλ'"]:
_exc[token] = [{ORTH: token, LEMMA: "αλλά", NORM: "αλλά"}] _exc[token] = [{ORTH: token, NORM: "αλλά"}]
for token in ["παρ'", "Παρ'", "ΠΑΡ'"]: for token in ["παρ'", "Παρ'", "ΠΑΡ'"]:
_exc[token] = [{ORTH: token, LEMMA: "παρά", NORM: "παρά"}] _exc[token] = [{ORTH: token, NORM: "παρά"}]
for token in ["καθ'", "Καθ'"]: for token in ["καθ'", "Καθ'"]:
_exc[token] = [{ORTH: token, LEMMA: "κάθε", NORM: "κάθε"}] _exc[token] = [{ORTH: token, NORM: "κάθε"}]
for token in ["κατ'", "Κατ'"]: for token in ["κατ'", "Κατ'"]:
_exc[token] = [{ORTH: token, LEMMA: "κατά", NORM: "κατά"}] _exc[token] = [{ORTH: token, NORM: "κατά"}]
for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]: for token in ["'ΣΟΥΝ", "'ναι", "'ταν", "'τανε", "'μαστε", "'μουνα", "'μουν"]:
_exc[token] = [{ORTH: token, LEMMA: "είμαι", NORM: "είμαι"}] _exc[token] = [{ORTH: token, NORM: "είμαι"}]
for token in ["Επ'", "επ'", "εφ'", "Εφ'"]: for token in ["Επ'", "επ'", "εφ'", "Εφ'"]:
_exc[token] = [{ORTH: token, LEMMA: "επί", NORM: "επί"}] _exc[token] = [{ORTH: token, NORM: "επί"}]
for token in ["Δι'", "δι'"]: for token in ["Δι'", "δι'"]:
_exc[token] = [{ORTH: token, LEMMA: "δια", NORM: "δια"}] _exc[token] = [{ORTH: token, NORM: "δια"}]
for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]: for token in ["'χουν", "'χουμε", "'χαμε", "'χα", "'χε", "'χεις", "'χει"]:
_exc[token] = [{ORTH: token, LEMMA: "έχω", NORM: "έχω"}] _exc[token] = [{ORTH: token, NORM: "έχω"}]
for token in ["υπ'", "Υπ'"]: for token in ["υπ'", "Υπ'"]:
_exc[token] = [{ORTH: token, LEMMA: "υπό", NORM: "υπό"}] _exc[token] = [{ORTH: token, NORM: "υπό"}]
for token in ["Μετ'", "ΜΕΤ'", "'μετ"]: for token in ["Μετ'", "ΜΕΤ'", "'μετ"]:
_exc[token] = [{ORTH: token, LEMMA: "μετά", NORM: "μετά"}] _exc[token] = [{ORTH: token, NORM: "μετά"}]
for token in ["Μ'", "μ'"]: for token in ["Μ'", "μ'"]:
_exc[token] = [{ORTH: token, LEMMA: "με", NORM: "με"}] _exc[token] = [{ORTH: token, NORM: "με"}]
for token in ["Γι'", "ΓΙ'", "γι'"]: for token in ["Γι'", "ΓΙ'", "γι'"]:
_exc[token] = [{ORTH: token, LEMMA: "για", NORM: "για"}] _exc[token] = [{ORTH: token, NORM: "για"}]
for token in ["Σ'", "σ'"]: for token in ["Σ'", "σ'"]:
_exc[token] = [{ORTH: token, LEMMA: "σε", NORM: "σε"}] _exc[token] = [{ORTH: token, NORM: "σε"}]
for token in ["Θ'", "θ'"]: for token in ["Θ'", "θ'"]:
_exc[token] = [{ORTH: token, LEMMA: "θα", NORM: "θα"}] _exc[token] = [{ORTH: token, NORM: "θα"}]
for token in ["Ν'", "ν'"]: for token in ["Ν'", "ν'"]:
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}] _exc[token] = [{ORTH: token, NORM: "να"}]
for token in ["Τ'", "τ'"]: for token in ["Τ'", "τ'"]:
_exc[token] = [{ORTH: token, LEMMA: "να", NORM: "να"}] _exc[token] = [{ORTH: token, NORM: "να"}]
for token in ["'γω", "'σένα", "'μεις"]: for token in ["'γω", "'σένα", "'μεις"]:
_exc[token] = [{ORTH: token, LEMMA: "εγώ", NORM: "εγώ"}] _exc[token] = [{ORTH: token, NORM: "εγώ"}]
for token in ["Τ'", "τ'"]: for token in ["Τ'", "τ'"]:
_exc[token] = [{ORTH: token, LEMMA: "το", NORM: "το"}] _exc[token] = [{ORTH: token, NORM: "το"}]
for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]: for token in ["Φέρ'", "Φερ'", "φέρ'", "φερ'"]:
_exc[token] = [{ORTH: token, LEMMA: "φέρνω", NORM: "φέρνω"}] _exc[token] = [{ORTH: token, NORM: "φέρνω"}]
for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]: for token in ["'ρθούνε", "'ρθουν", "'ρθει", "'ρθεί", "'ρθε", "'ρχεται"]:
_exc[token] = [{ORTH: token, LEMMA: "έρχομαι", NORM: "έρχομαι"}] _exc[token] = [{ORTH: token, NORM: "έρχομαι"}]
for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]: for token in ["'πανε", "'λεγε", "'λεγαν", "'πε", "'λεγα"]:
_exc[token] = [{ORTH: token, LEMMA: "λέγω", NORM: "λέγω"}] _exc[token] = [{ORTH: token, NORM: "λέγω"}]
for token in ["Πάρ'", "πάρ'"]: for token in ["Πάρ'", "πάρ'"]:
_exc[token] = [{ORTH: token, LEMMA: "παίρνω", NORM: "παίρνω"}] _exc[token] = [{ORTH: token, NORM: "παίρνω"}]
for token in ["μέσ'", "Μέσ'", "μεσ'"]: for token in ["μέσ'", "Μέσ'", "μεσ'"]:
_exc[token] = [{ORTH: token, LEMMA: "μέσα", NORM: "μέσα"}] _exc[token] = [{ORTH: token, NORM: "μέσα"}]
for token in ["Δέσ'", "Δεσ'", "δεσ'"]: for token in ["Δέσ'", "Δεσ'", "δεσ'"]:
_exc[token] = [{ORTH: token, LEMMA: "δένω", NORM: "δένω"}] _exc[token] = [{ORTH: token, NORM: "δένω"}]
for token in ["'κανε", "Κάν'"]: for token in ["'κανε", "Κάν'"]:
_exc[token] = [{ORTH: token, LEMMA: "κάνω", NORM: "κάνω"}] _exc[token] = [{ORTH: token, NORM: "κάνω"}]
_other_exc = { _other_exc = {
"κι": [{ORTH: "κι", LEMMA: "και", NORM: "και"}], "κι": [{ORTH: "κι", NORM: "και"}],
"Παίξ'": [{ORTH: "Παίξ'", LEMMA: "παίζω", NORM: "παίζω"}], "Παίξ'": [{ORTH: "Παίξ'", NORM: "παίζω"}],
"Αντ'": [{ORTH: "Αντ'", LEMMA: "αντί", NORM: "αντί"}], "Αντ'": [{ORTH: "Αντ'", NORM: "αντί"}],
"ολ'": [{ORTH: "ολ'", LEMMA: "όλος", NORM: "όλος"}], "ολ'": [{ORTH: "ολ'", NORM: "όλος"}],
"ύστερ'": [{ORTH: "ύστερ'", LEMMA: "ύστερα", NORM: "ύστερα"}], "ύστερ'": [{ORTH: "ύστερ'", NORM: "ύστερα"}],
"'πρεπε": [{ORTH: "'πρεπε", LEMMA: "πρέπει", NORM: "πρέπει"}], "'πρεπε": [{ORTH: "'πρεπε", NORM: "πρέπει"}],
"Δύσκολ'": [{ORTH: "Δύσκολ'", LEMMA: "δύσκολος", NORM: "δύσκολος"}], "Δύσκολ'": [{ORTH: "Δύσκολ'", NORM: "δύσκολος"}],
"'θελα": [{ORTH: "'θελα", LEMMA: "θέλω", NORM: "θέλω"}], "'θελα": [{ORTH: "'θελα", NORM: "θέλω"}],
"'γραφα": [{ORTH: "'γραφα", LEMMA: "γράφω", NORM: "γράφω"}], "'γραφα": [{ORTH: "'γραφα", NORM: "γράφω"}],
"'παιρνα": [{ORTH: "'παιρνα", LEMMA: "παίρνω", NORM: "παίρνω"}], "'παιρνα": [{ORTH: "'παιρνα", NORM: "παίρνω"}],
"'δειξε": [{ORTH: "'δειξε", LEMMA: "δείχνω", NORM: "δείχνω"}], "'δειξε": [{ORTH: "'δειξε", NORM: "δείχνω"}],
"όμουρφ'": [{ORTH: "όμουρφ'", LEMMA: "όμορφος", NORM: "όμορφος"}], "όμουρφ'": [{ORTH: "όμουρφ'", NORM: "όμορφος"}],
"κ'τσή": [{ORTH: "κ'τσή", LEMMA: "κουτσός", NORM: "κουτσός"}], "κ'τσή": [{ORTH: "κ'τσή", NORM: "κουτσός"}],
"μηδ'": [{ORTH: "μηδ'", LEMMA: "μήδε", NORM: "μήδε"}], "μηδ'": [{ORTH: "μηδ'", NORM: "μήδε"}],
"'ξομολογήθηκε": [ "'ξομολογήθηκε": [{ORTH: "'ξομολογήθηκε", NORM: "εξομολογούμαι"}],
{ORTH: "'ξομολογήθηκε", LEMMA: "εξομολογούμαι", NORM: "εξομολογούμαι"} "'μας": [{ORTH: "'μας", NORM: "εμάς"}],
], "'ξερες": [{ORTH: "'ξερες", NORM: "ξέρω"}],
"'μας": [{ORTH: "'μας", LEMMA: "εμάς", NORM: "εμάς"}], "έφθασ'": [{ORTH: "έφθασ'", NORM: "φθάνω"}],
"'ξερες": [{ORTH: "'ξερες", LEMMA: "ξέρω", NORM: "ξέρω"}], "εξ'": [{ORTH: "εξ'", NORM: "εκ"}],
"έφθασ'": [{ORTH: "έφθασ'", LEMMA: "φθάνω", NORM: "φθάνω"}], "δώσ'": [{ORTH: "δώσ'", NORM: "δίνω"}],
"εξ'": [{ORTH: "εξ'", LEMMA: "εκ", NORM: "εκ"}], "τίποτ'": [{ORTH: "τίποτ'", NORM: "τίποτα"}],
"δώσ'": [{ORTH: "δώσ'", LEMMA: "δίνω", NORM: "δίνω"}], "Λήξ'": [{ORTH: "Λήξ'", NORM: "λήγω"}],
"τίποτ'": [{ORTH: "τίποτ'", LEMMA: "τίποτα", NORM: "τίποτα"}], "άσ'": [{ORTH: "άσ'", NORM: "αφήνω"}],
"Λήξ'": [{ORTH: "Λήξ'", LEMMA: "λήγω", NORM: "λήγω"}], "Στ'": [{ORTH: "Στ'", NORM: "στο"}],
"άσ'": [{ORTH: "άσ'", LEMMA: "αφήνω", NORM: "αφήνω"}], "Δωσ'": [{ORTH: "Δωσ'", NORM: "δίνω"}],
"Στ'": [{ORTH: "Στ'", LEMMA: "στο", NORM: "στο"}], "Βάψ'": [{ORTH: "Βάψ'", NORM: "βάφω"}],
"Δωσ'": [{ORTH: "Δωσ'", LEMMA: "δίνω", NORM: "δίνω"}], "Αλλ'": [{ORTH: "Αλλ'", NORM: "αλλά"}],
"Βάψ'": [{ORTH: "Βάψ'", LEMMA: "βάφω", NORM: "βάφω"}], "Αμ'": [{ORTH: "Αμ'", NORM: "άμα"}],
"Αλλ'": [{ORTH: "Αλλ'", LEMMA: "αλλά", NORM: "αλλά"}], "Αγόρασ'": [{ORTH: "Αγόρασ'", NORM: "αγοράζω"}],
"Αμ'": [{ORTH: "Αμ'", LEMMA: "άμα", NORM: "άμα"}], "'φύγε": [{ORTH: "'φύγε", NORM: "φεύγω"}],
"Αγόρασ'": [{ORTH: "Αγόρασ'", LEMMA: "αγοράζω", NORM: "αγοράζω"}], "'φερε": [{ORTH: "'φερε", NORM: "φέρνω"}],
"'φύγε": [{ORTH: "'φύγε", LEMMA: "φεύγω", NORM: "φεύγω"}], "'φαγε": [{ORTH: "'φαγε", NORM: "τρώω"}],
"'φερε": [{ORTH: "'φερε", LEMMA: "φέρνω", NORM: "φέρνω"}], "'σπαγαν": [{ORTH: "'σπαγαν", NORM: "σπάω"}],
"'φαγε": [{ORTH: "'φαγε", LEMMA: "τρώω", NORM: "τρώω"}], "'σκασε": [{ORTH: "'σκασε", NORM: "σκάω"}],
"'σπαγαν": [{ORTH: "'σπαγαν", LEMMA: "σπάω", NORM: "σπάω"}], "'σβηνε": [{ORTH: "'σβηνε", NORM: "σβήνω"}],
"'σκασε": [{ORTH: "'σκασε", LEMMA: "σκάω", NORM: "σκάω"}], "'ριξε": [{ORTH: "'ριξε", NORM: "ρίχνω"}],
"'σβηνε": [{ORTH: "'σβηνε", LEMMA: "σβήνω", NORM: "σβήνω"}], "'κλεβε": [{ORTH: "'κλεβε", NORM: "κλέβω"}],
"'ριξε": [{ORTH: "'ριξε", LEMMA: "ρίχνω", NORM: "ρίχνω"}], "'κει": [{ORTH: "'κει", NORM: "εκεί"}],
"'κλεβε": [{ORTH: "'κλεβε", LEMMA: "κλέβω", NORM: "κλέβω"}], "'βλεπε": [{ORTH: "'βλεπε", NORM: "βλέπω"}],
"'κει": [{ORTH: "'κει", LEMMA: "εκεί", NORM: "εκεί"}], "'βγαινε": [{ORTH: "'βγαινε", NORM: "βγαίνω"}],
"'βλεπε": [{ORTH: "'βλεπε", LEMMA: "βλέπω", NORM: "βλέπω"}],
"'βγαινε": [{ORTH: "'βγαινε", LEMMA: "βγαίνω", NORM: "βγαίνω"}],
} }
_exc.update(_other_exc) _exc.update(_other_exc)
@ -133,35 +132,35 @@ for h in range(1, 12 + 1):
for period in ["π.μ.", "πμ"]: for period in ["π.μ.", "πμ"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "π.μ.", NORM: "π.μ."}, {ORTH: period, NORM: "π.μ."},
] ]
for period in ["μ.μ.", "μμ"]: for period in ["μ.μ.", "μμ"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "μ.μ.", NORM: "μ.μ."}, {ORTH: period, NORM: "μ.μ."},
] ]
for exc_data in [ for exc_data in [
{ORTH: "ΑΓΡ.", LEMMA: "Αγροτικός", NORM: "Αγροτικός"}, {ORTH: "ΑΓΡ.", NORM: "Αγροτικός"},
{ORTH: "Αγ. Γρ.", LEMMA: "Αγία Γραφή", NORM: "Αγία Γραφή"}, {ORTH: "Αγ. Γρ.", NORM: "Αγία Γραφή"},
{ORTH: "Αθ.", LEMMA: "Αθανάσιος", NORM: "Αθανάσιος"}, {ORTH: "Αθ.", NORM: "Αθανάσιος"},
{ORTH: "Αλεξ.", LEMMA: "Αλέξανδρος", NORM: "Αλέξανδρος"}, {ORTH: "Αλεξ.", NORM: "Αλέξανδρος"},
{ORTH: "Απρ.", LEMMA: "Απρίλιος", NORM: "Απρίλιος"}, {ORTH: "Απρ.", NORM: "Απρίλιος"},
{ORTH: "Αύγ.", LEMMA: "Αύγουστος", NORM: "Αύγουστος"}, {ORTH: "Αύγ.", NORM: "Αύγουστος"},
{ORTH: "Δεκ.", LEMMA: "Δεκέμβριος", NORM: "Δεκέμβριος"}, {ORTH: "Δεκ.", NORM: "Δεκέμβριος"},
{ORTH: "Δημ.", LEMMA: "Δήμος", NORM: "Δήμος"}, {ORTH: "Δημ.", NORM: "Δήμος"},
{ORTH: "Ιαν.", LEMMA: "Ιανουάριος", NORM: "Ιανουάριος"}, {ORTH: "Ιαν.", NORM: "Ιανουάριος"},
{ORTH: "Ιούλ.", LEMMA: "Ιούλιος", NORM: "Ιούλιος"}, {ORTH: "Ιούλ.", NORM: "Ιούλιος"},
{ORTH: "Ιούν.", LEMMA: "Ιούνιος", NORM: "Ιούνιος"}, {ORTH: "Ιούν.", NORM: "Ιούνιος"},
{ORTH: "Ιωαν.", LEMMA: "Ιωάννης", NORM: "Ιωάννης"}, {ORTH: "Ιωαν.", NORM: "Ιωάννης"},
{ORTH: "Μ. Ασία", LEMMA: "Μικρά Ασία", NORM: "Μικρά Ασία"}, {ORTH: "Μ. Ασία", NORM: "Μικρά Ασία"},
{ORTH: "Μάρτ.", LEMMA: "Μάρτιος", NORM: "Μάρτιος"}, {ORTH: "Μάρτ.", NORM: "Μάρτιος"},
{ORTH: "Μάρτ'", LEMMA: "Μάρτιος", NORM: "Μάρτιος"}, {ORTH: "Μάρτ'", NORM: "Μάρτιος"},
{ORTH: "Νοέμβρ.", LEMMA: "Νοέμβριος", NORM: "Νοέμβριος"}, {ORTH: "Νοέμβρ.", NORM: "Νοέμβριος"},
{ORTH: "Οκτ.", LEMMA: "Οκτώβριος", NORM: "Οκτώβριος"}, {ORTH: "Οκτ.", NORM: "Οκτώβριος"},
{ORTH: "Σεπτ.", LEMMA: "Σεπτέμβριος", NORM: "Σεπτέμβριος"}, {ORTH: "Σεπτ.", NORM: "Σεπτέμβριος"},
{ORTH: "Φεβρ.", LEMMA: "Φεβρουάριος", NORM: "Φεβρουάριος"}, {ORTH: "Φεβρ.", NORM: "Φεβρουάριος"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -392,4 +391,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
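A quick plain-Python sketch of the invariant these exception entries rely on (string stand-ins are used for the spacy.symbols ORTH/NORM constants, which are really integer IDs): the ORTH pieces of each value must join back into the key string, and NORM is the only per-piece override left now that LEMMA is gone.

    ORTH, NORM = "orth", "norm"  # stand-ins for the integer IDs in spacy.symbols
    exc = {
        "Απρ.": [{ORTH: "Απρ.", NORM: "Απρίλιος"}],
        "1π.μ.": [{ORTH: "1"}, {ORTH: "π.μ.", NORM: "π.μ."}],
    }
    for text, pieces in exc.items():
        # the tokenizer requires that the split pieces reproduce the original string
        assert "".join(piece[ORTH] for piece in pieces) == text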


@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -7,60 +7,43 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import is_base_form from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...lemmatizer import Lemmatizer from ...lemmatizer import Lemmatizer
from ...util import update_exc, registry from ...lookups import load_lookups
from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "en"
stop_words = {"@language_data": "spacy.en.stop_words"}
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.EnglishLemmatizer.v1" @lemmatizers = "spacy.en.EnglishLemmatizer"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
""" """
@registry.language_data("spacy.en.stop_words") @registry.lemmatizers("spacy.en.EnglishLemmatizer")
def stop_words() -> Set[str]: def create_lemmatizer() -> Callable[[Language], Lemmatizer]:
return STOP_WORDS tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
def lemmatizer_factory(nlp: Language) -> Lemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return Lemmatizer(lookups=lookups, is_base_form=is_base_form)
@registry.language_data("spacy.en.lex_attr_getters") return lemmatizer_factory
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
class EnglishDefaults(Language.Defaults): class EnglishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) config = Config().from_str(DEFAULT_CONFIG)
syntax_iterators = SYNTAX_ITERATORS tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
single_orth_variants = [ lex_attr_getters = LEX_ATTRS
{"tags": ["NFP"], "variants": ["", "..."]}, syntax_iterators = SYNTAX_ITERATORS
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]}, stop_words = STOP_WORDS
]
paired_orth_variants = [
{"tags": ["``", "''"], "variants": [("'", "'"), ("", "")]},
{"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]},
]
class English(Language): class English(Language):
lang = "en" lang = "en"
Defaults = EnglishDefaults Defaults = EnglishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["English"] __all__ = ["English"]


@ -1,5 +1,5 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_infixes = ( _infixes = (
LIST_ELLIPSES LIST_ELLIPSES


@ -1,27 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
labels = [ # fmt: on
"nsubj",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"dative",
"appos",
"attr",
"ROOT",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings.add(label) for label in labels] np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
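A usage sketch for the iterator above; it assumes a trained English pipeline with a parser is installed (the package name en_core_web_sm is illustrative here), since noun_chunks raises E029 on an unparsed Doc:

    import spacy

    nlp = spacy.load("en_core_web_sm")          # any pipeline that sets the dependency parse
    doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
    for chunk in doc.noun_chunks:               # backed by the labels list shown above
        print(chunk.text, "<-", chunk.root.dep_)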


@ -1,4 +1,6 @@
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
@ -26,110 +28,110 @@ _exclude = [
for pron in ["i"]: for pron in ["i"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [ _exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"}, {ORTH: "'m", NORM: "am"},
] ]
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}, {ORTH: "m", "tenspect": 1, "number": 1},
] ]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "'m", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}, {ORTH: "a", NORM: "gonna"},
] ]
_exc[orth + "ma"] = [ _exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "m", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}, {ORTH: "a", NORM: "gonna"},
] ]
for pron in ["i", "you", "he", "she", "it", "we", "they"]: for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
] ]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
] ]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'d", NORM: "'d"}, {ORTH: "'d", NORM: "'d"},
] ]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "d", NORM: "'d"}, {ORTH: "d", NORM: "'d"},
] ]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "'d", NORM: "would"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "d", NORM: "would"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
for pron in ["i", "you", "we", "they"]: for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
for pron in ["you", "we", "they"]: for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", NORM: "are"},
] ]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}, {ORTH: "re", NORM: "are"},
] ]
for pron in ["he", "she", "it"]: for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "'s", NORM: "'s"}, {ORTH: "'s", NORM: "'s"},
] ]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, NORM: pron},
{ORTH: "s"}, {ORTH: "s"},
] ]
@ -151,145 +153,145 @@ for word in [
]: ]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"}, {ORTH: "'s", NORM: "'s"},
] ]
_exc[orth + "s"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "s"}] _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
] ]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
] ]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "'ll", NORM: "will"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "ll", NORM: "will"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", NORM: "are"},
] ]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}, {ORTH: "re", NORM: "are"},
] ]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}, {ORTH: "'ve"},
] ]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'d", NORM: "'d"}, {ORTH: "'d", NORM: "'d"},
] ]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "d", NORM: "'d"}, {ORTH: "d", NORM: "'d"},
] ]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "'d", NORM: "would"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "d", NORM: "would"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
# Verbs # Verbs
for verb_data in [ for verb_data in [
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"}, {ORTH: "ca", NORM: "can"},
{ORTH: "could", NORM: "could", TAG: "MD"}, {ORTH: "could", NORM: "could"},
{ORTH: "do", LEMMA: "do", NORM: "do"}, {ORTH: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do", NORM: "does"}, {ORTH: "does", NORM: "does"},
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"}, {ORTH: "did", NORM: "do"},
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"}, {ORTH: "had", NORM: "have"},
{ORTH: "may", NORM: "may", TAG: "MD"}, {ORTH: "may", NORM: "may"},
{ORTH: "might", NORM: "might", TAG: "MD"}, {ORTH: "might", NORM: "might"},
{ORTH: "must", NORM: "must", TAG: "MD"}, {ORTH: "must", NORM: "must"},
{ORTH: "need", NORM: "need"}, {ORTH: "need", NORM: "need"},
{ORTH: "ought", NORM: "ought", TAG: "MD"}, {ORTH: "ought", NORM: "ought"},
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"}, {ORTH: "sha", NORM: "shall"},
{ORTH: "should", NORM: "should", TAG: "MD"}, {ORTH: "should", NORM: "should"},
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"}, {ORTH: "wo", NORM: "will"},
{ORTH: "would", NORM: "would", TAG: "MD"}, {ORTH: "would", NORM: "would"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "n't", NORM: "not"},
] ]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "nt", NORM: "not"},
] ]
_exc[data[ORTH] + "n't've"] = [ _exc[data[ORTH] + "n't've"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "n't", NORM: "not"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "'ve", NORM: "have"},
] ]
_exc[data[ORTH] + "ntve"] = [ _exc[data[ORTH] + "ntve"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "nt", NORM: "not"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}, {ORTH: "ve", NORM: "have"},
] ]
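The loop above depends on copying each base entry and title-casing its ORTH so both spellings get the same continuations; a self-contained sketch of that duplication pattern (string stand-ins for the symbol constants):

    ORTH, NORM = "orth", "norm"    # stand-ins for spacy.symbols.ORTH / NORM
    exc = {}
    verb_data = {ORTH: "ca", NORM: "can"}
    verb_data_tc = dict(verb_data)                     # copy, then title-case the surface form
    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
    for data in (verb_data, verb_data_tc):
        exc[data[ORTH] + "n't"] = [dict(data), {ORTH: "n't", NORM: "not"}]
    print(sorted(exc))                                 # ["Can't", "can't"]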
for verb_data in [ for verb_data in [
{ORTH: "could", NORM: "could", TAG: "MD"}, {ORTH: "could", NORM: "could"},
{ORTH: "might", NORM: "might", TAG: "MD"}, {ORTH: "might", NORM: "might"},
{ORTH: "must", NORM: "must", TAG: "MD"}, {ORTH: "must", NORM: "must"},
{ORTH: "should", NORM: "should", TAG: "MD"}, {ORTH: "should", NORM: "should"},
{ORTH: "would", NORM: "would", TAG: "MD"}, {ORTH: "would", NORM: "would"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve", LEMMA: "have", TAG: "VB"}] _exc[data[ORTH] + "'ve"] = [dict(data), {ORTH: "'ve"}]
_exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve", LEMMA: "have", TAG: "VB"}] _exc[data[ORTH] + "ve"] = [dict(data), {ORTH: "ve"}]
for verb_data in [ for verb_data in [
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "ai", "number": 2},
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2}, {ORTH: "are", NORM: "are", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"}, {ORTH: "is", NORM: "is"},
{ORTH: "was", LEMMA: "be", NORM: "was"}, {ORTH: "was", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}, {ORTH: "were", NORM: "were"},
{ORTH: "have", NORM: "have"}, {ORTH: "have", NORM: "have"},
{ORTH: "has", LEMMA: "have", NORM: "has"}, {ORTH: "has", NORM: "has"},
{ORTH: "dare", NORM: "dare"}, {ORTH: "dare", NORM: "dare"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
@ -297,24 +299,24 @@ for verb_data in [
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "n't", NORM: "not"},
] ]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}, {ORTH: "nt", NORM: "not"},
] ]
# Other contractions with trailing apostrophe # Other contractions with trailing apostrophe
for exc_data in [ for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"}, {ORTH: "nothin", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"}, {ORTH: "nuthin", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old", NORM: "old"}, {ORTH: "ol", NORM: "old"},
{ORTH: "somethin", LEMMA: "something", NORM: "something"}, {ORTH: "somethin", NORM: "something"},
]: ]:
exc_data_tc = dict(exc_data) exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title() exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
@ -329,9 +331,9 @@ for exc_data in [
for exc_data in [ for exc_data in [
{ORTH: "cause", NORM: "because"}, {ORTH: "cause", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, {ORTH: "em", NORM: "them"},
{ORTH: "ll", LEMMA: "will", NORM: "will"}, {ORTH: "ll", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}, {ORTH: "nuff", NORM: "enough"},
]: ]:
exc_data_apos = dict(exc_data) exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
@ -345,166 +347,131 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}, {ORTH: period, NORM: "a.m."},
] ]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc[f"{h}{period}"] = [ _exc[f"{h}{period}"] = [
{ORTH: f"{h}"}, {ORTH: f"{h}"},
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}, {ORTH: period, NORM: "p.m."},
] ]
# Rest # Rest
_other_exc = { _other_exc = {
"y'all": [{ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}], "y'all": [{ORTH: "y'", NORM: "you"}, {ORTH: "all"}],
"yall": [{ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, {ORTH: "all"}], "yall": [{ORTH: "y", NORM: "you"}, {ORTH: "all"}],
"how'd'y": [ "how'd'y": [{ORTH: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}],
{ORTH: "how", LEMMA: "how"}, "How'd'y": [{ORTH: "How", NORM: "how"}, {ORTH: "'d"}, {ORTH: "'y", NORM: "you"}],
{ORTH: "'d", LEMMA: "do"}, "not've": [{ORTH: "not"}, {ORTH: "'ve", NORM: "have"}],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}, "notve": [{ORTH: "not"}, {ORTH: "ve", NORM: "have"}],
], "Not've": [{ORTH: "Not", NORM: "not"}, {ORTH: "'ve", NORM: "have"}],
"How'd'y": [ "Notve": [{ORTH: "Not", NORM: "not"}, {ORTH: "ve", NORM: "have"}],
{ORTH: "How", LEMMA: "how", NORM: "how"}, "cannot": [{ORTH: "can"}, {ORTH: "not"}],
{ORTH: "'d", LEMMA: "do"}, "Cannot": [{ORTH: "Can", NORM: "can"}, {ORTH: "not"}],
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}, "gonna": [{ORTH: "gon", NORM: "going"}, {ORTH: "na", NORM: "to"}],
], "Gonna": [{ORTH: "Gon", NORM: "going"}, {ORTH: "na", NORM: "to"}],
"not've": [ "gotta": [{ORTH: "got"}, {ORTH: "ta", NORM: "to"}],
{ORTH: "not", LEMMA: "not", TAG: "RB"}, "Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", NORM: "to"}],
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}, "let's": [{ORTH: "let"}, {ORTH: "'s", NORM: "us"}],
], "Let's": [{ORTH: "Let", NORM: "let"}, {ORTH: "'s", NORM: "us"}],
"notve": [ "c'mon": [{ORTH: "c'm", NORM: "come"}, {ORTH: "on"}],
{ORTH: "not", LEMMA: "not", TAG: "RB"}, "C'mon": [{ORTH: "C'm", NORM: "come"}, {ORTH: "on"}],
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
],
"Not've": [
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"},
],
"Notve": [
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"},
],
"cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"},
],
"Cannot": [
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"},
],
"gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to", NORM: "to"},
],
"Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to", NORM: "to"},
],
"gotta": [{ORTH: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [{ORTH: "Got", NORM: "got"}, {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [{ORTH: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [
{ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
],
"c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
"C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
} }
_exc.update(_other_exc) _exc.update(_other_exc)
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: "'s", NORM: "'s"}, {ORTH: "'S", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s", NORM: "'s"}, {ORTH: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"}, {ORTH: "\u2018S", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"}, {ORTH: "\u2018s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"}, {ORTH: "and/or", NORM: "and/or"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"}, {ORTH: "w/o", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because", NORM: "because"}, {ORTH: "'Cause", NORM: "because"},
{ORTH: "'cause", LEMMA: "because", NORM: "because"}, {ORTH: "'cause", NORM: "because"},
{ORTH: "'cos", LEMMA: "because", NORM: "because"}, {ORTH: "'cos", NORM: "because"},
{ORTH: "'Cos", LEMMA: "because", NORM: "because"}, {ORTH: "'Cos", NORM: "because"},
{ORTH: "'coz", LEMMA: "because", NORM: "because"}, {ORTH: "'coz", NORM: "because"},
{ORTH: "'Coz", LEMMA: "because", NORM: "because"}, {ORTH: "'Coz", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"}, {ORTH: "'cuz", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"}, {ORTH: "'Cuz", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"}, {ORTH: "'bout", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"}, {ORTH: "ma'am", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"}, {ORTH: "Ma'am", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"}, {ORTH: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"}, {ORTH: "O'clock", NORM: "o'clock"},
{ORTH: "lovin'", LEMMA: "love", NORM: "loving"}, {ORTH: "lovin'", NORM: "loving"},
{ORTH: "Lovin'", LEMMA: "love", NORM: "loving"}, {ORTH: "Lovin'", NORM: "loving"},
{ORTH: "lovin", LEMMA: "love", NORM: "loving"}, {ORTH: "lovin", NORM: "loving"},
{ORTH: "Lovin", LEMMA: "love", NORM: "loving"}, {ORTH: "Lovin", NORM: "loving"},
{ORTH: "havin'", LEMMA: "have", NORM: "having"}, {ORTH: "havin'", NORM: "having"},
{ORTH: "Havin'", LEMMA: "have", NORM: "having"}, {ORTH: "Havin'", NORM: "having"},
{ORTH: "havin", LEMMA: "have", NORM: "having"}, {ORTH: "havin", NORM: "having"},
{ORTH: "Havin", LEMMA: "have", NORM: "having"}, {ORTH: "Havin", NORM: "having"},
{ORTH: "doin'", LEMMA: "do", NORM: "doing"}, {ORTH: "doin'", NORM: "doing"},
{ORTH: "Doin'", LEMMA: "do", NORM: "doing"}, {ORTH: "Doin'", NORM: "doing"},
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", NORM: "doing"},
{ORTH: "Doin", LEMMA: "do", NORM: "doing"}, {ORTH: "Doin", NORM: "doing"},
{ORTH: "goin'", LEMMA: "go", NORM: "going"}, {ORTH: "goin'", NORM: "going"},
{ORTH: "Goin'", LEMMA: "go", NORM: "going"}, {ORTH: "Goin'", NORM: "going"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", NORM: "going"},
{ORTH: "Goin", LEMMA: "go", NORM: "going"}, {ORTH: "Goin", NORM: "going"},
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"}, {ORTH: "Mt.", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"}, {ORTH: "Ak.", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"}, {ORTH: "Ala.", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"}, {ORTH: "Ariz.", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"}, {ORTH: "Ark.", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"}, {ORTH: "Aug.", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California", NORM: "California"}, {ORTH: "Calif.", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"}, {ORTH: "Colo.", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"}, {ORTH: "Conn.", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December", NORM: "December"}, {ORTH: "Dec.", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"}, {ORTH: "Del.", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February", NORM: "February"}, {ORTH: "Feb.", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"}, {ORTH: "Fla.", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"}, {ORTH: "Ga.", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"}, {ORTH: "Ia.", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"}, {ORTH: "Id.", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"}, {ORTH: "Ill.", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"}, {ORTH: "Ind.", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January", NORM: "January"}, {ORTH: "Jan.", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July", NORM: "July"}, {ORTH: "Jul.", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June", NORM: "June"}, {ORTH: "Jun.", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"}, {ORTH: "Kan.", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"}, {ORTH: "Kans.", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"}, {ORTH: "Ky.", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"}, {ORTH: "La.", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March", NORM: "March"}, {ORTH: "Mar.", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"}, {ORTH: "Mass.", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May", NORM: "May"}, {ORTH: "May.", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"}, {ORTH: "Mich.", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"}, {ORTH: "Minn.", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"}, {ORTH: "Miss.", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"}, {ORTH: "N.C.", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"}, {ORTH: "N.D.", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"}, {ORTH: "N.H.", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"}, {ORTH: "N.J.", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"}, {ORTH: "N.M.", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"}, {ORTH: "N.Y.", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"}, {ORTH: "Neb.", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"}, {ORTH: "Nebr.", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"}, {ORTH: "Nev.", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October", NORM: "October"}, {ORTH: "Oct.", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"}, {ORTH: "Okla.", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"}, {ORTH: "Ore.", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"}, {ORTH: "Pa.", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"}, {ORTH: "S.C.", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"}, {ORTH: "Sept.", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"}, {ORTH: "Tenn.", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, {ORTH: "Va.", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, {ORTH: "Wash.", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}, {ORTH: "Wis.", NORM: "Wisconsin"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -555,4 +522,4 @@ for string in _exclude:
_exc.pop(string) _exc.pop(string)
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
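A small end-to-end check of what these entries do at runtime; this is a hedged sketch that assumes the development snapshot imports cleanly, and no trained model is needed because the bare English class is enough:

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("We can't meet at 5pm, ma'am.")
    print([(t.text, t.norm_) for t in doc])
    # illustrative: ("ca", "can"), ("n't", "not"), ("5", "5"), ("pm", "p.m."), ("ma'am", "madam")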


@ -1,52 +1,23 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "es"
stop_words = {"@language_data": "spacy.es.stop_words"}
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.es.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SpanishDefaults(Language.Defaults): class SpanishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Spanish(Language): class Spanish(Language):
lang = "es" lang = "es"
Defaults = SpanishDefaults Defaults = SpanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Spanish"] __all__ = ["Spanish"]


@ -1,13 +1,15 @@
from typing import Union, Iterator, Optional, List, Tuple
from ...symbols import NOUN, PROPN, PRON, VERB, AUX from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span, Token
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc doc = doclike.doc
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
if not len(doc): if not len(doc):
return return
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
@ -28,18 +30,24 @@ def noun_chunks(doclike):
token = next_token(token) token = next_token(token)
def is_verb_token(token): def is_verb_token(token: Token) -> bool:
return token.pos in [VERB, AUX] return token.pos in [VERB, AUX]
def next_token(token): def next_token(token: Token) -> Optional[Token]:
try: try:
return token.nbor() return token.nbor()
except IndexError: except IndexError:
return None return None
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps): def noun_bounds(
doc: Doc,
root: Token,
np_left_deps: List[str],
np_right_deps: List[str],
stop_deps: List[str],
) -> Tuple[Token, Token]:
left_bound = root left_bound = root
for token in reversed(list(root.lefts)): for token in reversed(list(root.lefts)):
if token.dep in np_left_deps: if token.dep in np_left_deps:
@ -50,12 +58,8 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left, right = noun_bounds( left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps doc, token, np_left_deps, np_right_deps, stop_deps
) )
if list( filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
filter( if list(filter(filter_func, doc[left_bound.i : right.i],)):
lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i : right.i],
)
):
break break
else: else:
right_bound = right right_bound = right
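The filter_func rewrite above is behavior-preserving; as a tiny plain-Python sanity sketch, materializing filter(...) is truthy exactly when any(...) is:

    tokens = [1, 2, 3, 4]
    filter_func = lambda t: t % 2 == 0          # stands in for is_verb_token(t) or t.dep in stop_deps
    assert bool(list(filter(filter_func, tokens))) == any(filter_func(t) for t in tokens)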


@ -1,25 +1,27 @@
from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = { _exc = {
"pal": [{ORTH: "pa", LEMMA: "para"}, {ORTH: "l", LEMMA: "el", NORM: "el"}], "pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
} }
for exc_data in [ for exc_data in [
{ORTH: "", LEMMA: "número"}, {ORTH: ""},
{ORTH: "°C", LEMMA: "grados Celcius"}, {ORTH: "°C"},
{ORTH: "aprox.", LEMMA: "aproximadamente"}, {ORTH: "aprox."},
{ORTH: "dna.", LEMMA: "docena"}, {ORTH: "dna."},
{ORTH: "dpto.", LEMMA: "departamento"}, {ORTH: "dpto."},
{ORTH: "ej.", LEMMA: "ejemplo"}, {ORTH: "ej."},
{ORTH: "esq.", LEMMA: "esquina"}, {ORTH: "esq."},
{ORTH: "pág.", LEMMA: "página"}, {ORTH: "pág."},
{ORTH: "p.ej.", LEMMA: "por ejemplo"}, {ORTH: "p.ej."},
{ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Ud.", NORM: "usted"},
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Vd.", NORM: "usted"},
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Uds.", NORM: "ustedes"},
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", NORM: "ustedes"},
{ORTH: "vol.", NORM: "volúmen"}, {ORTH: "vol.", NORM: "volúmen"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -27,14 +29,14 @@ for exc_data in [
# Times # Times
_exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}] _exc["12m."] = [{ORTH: "12"}, {ORTH: "m."}]
for h in range(1, 12 + 1): for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}] _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period}]
for orth in [ for orth in [
@ -73,4 +75,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class EstonianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "et"
stop_words = {"@language_data": "spacy.et.stop_words"}
"""
@registry.language_data("spacy.et.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Estonian(Language): class Estonian(Language):
lang = "et" lang = "et"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = EstonianDefaults
__all__ = ["Estonian"] __all__ = ["Estonian"]


@ -1,41 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "eu"
stop_words = {"@language_data": "spacy.eu.stop_words"}
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
"""
@registry.language_data("spacy.eu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.eu.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class BasqueDefaults(Language.Defaults): class BasqueDefaults(Language.Defaults):
tokenizer_exceptions = BASE_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Basque(Language): class Basque(Language):
lang = "eu" lang = "eu"
Defaults = BasqueDefaults Defaults = BasqueDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Basque"] __all__ = ["Basque"]


@ -1,55 +1,23 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language
from ...util import update_exc, registry
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
DEFAULT_CONFIG = """
[nlp]
lang = "fa"
stop_words = {"@language_data": "spacy.fa.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.fa.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fa.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class PersianDefaults(Language.Defaults): class PersianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Persian(Language): class Persian(Language):
lang = "fa" lang = "fa"
Defaults = PersianDefaults Defaults = PersianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Persian"] __all__ = ["Persian"]

File diff suppressed because it is too large


@ -1,43 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "fi"
stop_words = {"@language_data": "spacy.fi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
"""
@registry.language_data("spacy.fi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FinnishDefaults(Language.Defaults): class FinnishDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Finnish(Language): class Finnish(Language):
lang = "fi" lang = "fi"
Defaults = FinnishDefaults Defaults = FinnishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Finnish"] __all__ = ["Finnish"]


@ -1,4 +1,6 @@
from ...symbols import ORTH, LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
@ -6,76 +8,76 @@ _exc = {}
# Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html # Source https://www.cs.tut.fi/~jkorpela/kielenopas/5.5.html
for exc_data in [ for exc_data in [
{ORTH: "aik.", LEMMA: "aikaisempi"}, {ORTH: "aik."},
{ORTH: "alk.", LEMMA: "alkaen"}, {ORTH: "alk."},
{ORTH: "alv.", LEMMA: "arvonlisävero"}, {ORTH: "alv."},
{ORTH: "ark.", LEMMA: "arkisin"}, {ORTH: "ark."},
{ORTH: "as.", LEMMA: "asunto"}, {ORTH: "as."},
{ORTH: "eaa.", LEMMA: "ennen ajanlaskun alkua"}, {ORTH: "eaa."},
{ORTH: "ed.", LEMMA: "edellinen"}, {ORTH: "ed."},
{ORTH: "esim.", LEMMA: "esimerkki"}, {ORTH: "esim."},
{ORTH: "huom.", LEMMA: "huomautus"}, {ORTH: "huom."},
{ORTH: "jne.", LEMMA: "ja niin edelleen"}, {ORTH: "jne."},
{ORTH: "joht.", LEMMA: "johtaja"}, {ORTH: "joht."},
{ORTH: "k.", LEMMA: "kuollut"}, {ORTH: "k."},
{ORTH: "ks.", LEMMA: "katso"}, {ORTH: "ks."},
{ORTH: "lk.", LEMMA: "luokka"}, {ORTH: "lk."},
{ORTH: "lkm.", LEMMA: "lukumäärä"}, {ORTH: "lkm."},
{ORTH: "lyh.", LEMMA: "lyhenne"}, {ORTH: "lyh."},
{ORTH: "läh.", LEMMA: "lähettäjä"}, {ORTH: "läh."},
{ORTH: "miel.", LEMMA: "mieluummin"}, {ORTH: "miel."},
{ORTH: "milj.", LEMMA: "miljoona"}, {ORTH: "milj."},
{ORTH: "Mm.", LEMMA: "muun muassa"}, {ORTH: "Mm."},
{ORTH: "mm.", LEMMA: "muun muassa"}, {ORTH: "mm."},
{ORTH: "myöh.", LEMMA: "myöhempi"}, {ORTH: "myöh."},
{ORTH: "n.", LEMMA: "noin"}, {ORTH: "n."},
{ORTH: "nimim.", LEMMA: "nimimerkki"}, {ORTH: "nimim."},
{ORTH: "n:o", LEMMA: "numero"}, {ORTH: "n:o"},
{ORTH: "N:o", LEMMA: "numero"}, {ORTH: "N:o"},
{ORTH: "nro", LEMMA: "numero"}, {ORTH: "nro"},
{ORTH: "ns.", LEMMA: "niin sanottu"}, {ORTH: "ns."},
{ORTH: "nyk.", LEMMA: "nykyinen"}, {ORTH: "nyk."},
{ORTH: "oik.", LEMMA: "oikealla"}, {ORTH: "oik."},
{ORTH: "os.", LEMMA: "osoite"}, {ORTH: "os."},
{ORTH: "p.", LEMMA: "päivä"}, {ORTH: "p."},
{ORTH: "par.", LEMMA: "paremmin"}, {ORTH: "par."},
{ORTH: "per.", LEMMA: "perustettu"}, {ORTH: "per."},
{ORTH: "pj.", LEMMA: "puheenjohtaja"}, {ORTH: "pj."},
{ORTH: "puh.joht.", LEMMA: "puheenjohtaja"}, {ORTH: "puh.joht."},
{ORTH: "prof.", LEMMA: "professori"}, {ORTH: "prof."},
{ORTH: "puh.", LEMMA: "puhelin"}, {ORTH: "puh."},
{ORTH: "pvm.", LEMMA: "päivämäärä"}, {ORTH: "pvm."},
{ORTH: "rak.", LEMMA: "rakennettu"}, {ORTH: "rak."},
{ORTH: "ry.", LEMMA: "rekisteröity yhdistys"}, {ORTH: "ry."},
{ORTH: "s.", LEMMA: "sivu"}, {ORTH: "s."},
{ORTH: "siht.", LEMMA: "sihteeri"}, {ORTH: "siht."},
{ORTH: "synt.", LEMMA: "syntynyt"}, {ORTH: "synt."},
{ORTH: "t.", LEMMA: "toivoo"}, {ORTH: "t."},
{ORTH: "tark.", LEMMA: "tarkastanut"}, {ORTH: "tark."},
{ORTH: "til.", LEMMA: "tilattu"}, {ORTH: "til."},
{ORTH: "tms.", LEMMA: "tai muuta sellaista"}, {ORTH: "tms."},
{ORTH: "toim.", LEMMA: "toimittanut"}, {ORTH: "toim."},
{ORTH: "v.", LEMMA: "vuosi"}, {ORTH: "v."},
{ORTH: "vas.", LEMMA: "vasen"}, {ORTH: "vas."},
{ORTH: "vast.", LEMMA: "vastaus"}, {ORTH: "vast."},
{ORTH: "vrt.", LEMMA: "vertaa"}, {ORTH: "vrt."},
{ORTH: "yht.", LEMMA: "yhteensä"}, {ORTH: "yht."},
{ORTH: "yl.", LEMMA: "yleinen"}, {ORTH: "yl."},
{ORTH: "ym.", LEMMA: "ynnä muuta"}, {ORTH: "ym."},
{ORTH: "yms.", LEMMA: "ynnä muuta sellaista"}, {ORTH: "yms."},
{ORTH: "yo.", LEMMA: "ylioppilas"}, {ORTH: "yo."},
{ORTH: "yliopp.", LEMMA: "ylioppilas"}, {ORTH: "yliopp."},
{ORTH: "ao.", LEMMA: "asianomainen"}, {ORTH: "ao."},
{ORTH: "em.", LEMMA: "edellä mainittu"}, {ORTH: "em."},
{ORTH: "ko.", LEMMA: "kyseessä oleva"}, {ORTH: "ko."},
{ORTH: "ml.", LEMMA: "mukaan luettuna"}, {ORTH: "ml."},
{ORTH: "po.", LEMMA: "puheena oleva"}, {ORTH: "po."},
{ORTH: "so.", LEMMA: "se on"}, {ORTH: "so."},
{ORTH: "ts.", LEMMA: "toisin sanoen"}, {ORTH: "ts."},
{ORTH: "vm.", LEMMA: "viimeksi mainittu"}, {ORTH: "vm."},
{ORTH: "srk.", LEMMA: "seurakunta"}, {ORTH: "srk."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
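With the LEMMA values dropped, an entry that carries only ORTH simply protects the abbreviation from being split on its period; a hedged check with the bare Finnish class:

    from spacy.lang.fi import Finnish

    nlp = Finnish()
    print([t.text for t in nlp("Hinta nousi n. 5 % vuodessa, ks. taulukko jne.")])
    # illustrative: 'n.', 'ks.' and 'jne.' each stay a single token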


@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
@ -6,56 +6,47 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from .lemmatizer import FrenchLemmatizer, is_base_form
from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1" @lemmatizers = "spacy.fr.FrenchLemmatizer"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
""" """
@registry.lemmatizers("spacy.FrenchLemmatizer.v1") @registry.lemmatizers("spacy.fr.FrenchLemmatizer")
def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer: def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]:
return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form) tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form)
@registry.language_data("spacy.fr.stop_words") return lemmatizer_factory
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FrenchDefaults(Language.Defaults): class FrenchDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH token_match = TOKEN_MATCH
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class French(Language): class French(Language):
lang = "fr" lang = "fr"
Defaults = FrenchDefaults Defaults = FrenchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["French"] __all__ = ["French"]


@ -1,26 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
labels = [ # fmt: on
"nsubj",
"nsubj:pass",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")


@ -1,8 +1,11 @@
import re import re
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS from .punctuation import ELISION, HYPHENS
from ..char_classes import ALPHA_LOWER, ALPHA from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA from ...symbols import ORTH
from ...util import update_exc
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer # not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS # from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
@ -25,29 +28,29 @@ def lower_first_letter(text):
return text[0].lower() + text[1:] return text[0].lower() + text[1:]
_exc = {"J.-C.": [{LEMMA: "Jésus", ORTH: "J."}, {LEMMA: "Christ", ORTH: "-C."}]} _exc = {"J.-C.": [{ORTH: "J."}, {ORTH: "-C."}]}
for exc_data in [ for exc_data in [
{LEMMA: "avant", ORTH: "av."}, {ORTH: "av."},
{LEMMA: "janvier", ORTH: "janv."}, {ORTH: "janv."},
{LEMMA: "février", ORTH: "févr."}, {ORTH: "févr."},
{LEMMA: "avril", ORTH: "avr."}, {ORTH: "avr."},
{LEMMA: "juillet", ORTH: "juill."}, {ORTH: "juill."},
{LEMMA: "septembre", ORTH: "sept."}, {ORTH: "sept."},
{LEMMA: "octobre", ORTH: "oct."}, {ORTH: "oct."},
{LEMMA: "novembre", ORTH: "nov."}, {ORTH: "nov."},
{LEMMA: "décembre", ORTH: "déc."}, {ORTH: "déc."},
{LEMMA: "après", ORTH: "apr."}, {ORTH: "apr."},
{LEMMA: "docteur", ORTH: "Dr."}, {ORTH: "Dr."},
{LEMMA: "monsieur", ORTH: "M."}, {ORTH: "M."},
{LEMMA: "monsieur", ORTH: "Mr."}, {ORTH: "Mr."},
{LEMMA: "madame", ORTH: "Mme."}, {ORTH: "Mme."},
{LEMMA: "mademoiselle", ORTH: "Mlle."}, {ORTH: "Mlle."},
{LEMMA: "numéro", ORTH: ""}, {ORTH: ""},
{LEMMA: "degrés", ORTH: ""}, {ORTH: ""},
{LEMMA: "saint", ORTH: "St."}, {ORTH: "St."},
{LEMMA: "sainte", ORTH: "Ste."}, {ORTH: "Ste."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -77,55 +80,37 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
for verb, verb_lemma in [ for verb in [
("a", "avoir"), "a",
("est", "être"), "est" "semble",
("semble", "sembler"), "indique",
("indique", "indiquer"), "moque",
("moque", "moquer"), "passe",
("passe", "passer"),
]: ]:
for orth in [verb, verb.title()]: for orth in [verb, verb.title()]:
for pronoun in ["elle", "il", "on"]: for pronoun in ["elle", "il", "on"]:
token = f"{orth}-t-{pronoun}" token = f"{orth}-t-{pronoun}"
_exc[token] = [ _exc[token] = [{ORTH: orth}, {ORTH: "-t"}, {ORTH: "-" + pronoun}]
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
{LEMMA: "t", ORTH: "-t"},
{LEMMA: pronoun, ORTH: "-" + pronoun},
]
for verb, verb_lemma in [("est", "être")]: for verb in ["est"]:
for orth in [verb, verb.title()]: for orth in [verb, verb.title()]:
token = f"{orth}-ce" _exc[f"{orth}-ce"] = [{ORTH: orth}, {ORTH: "-ce"}]
_exc[token] = [
{LEMMA: verb_lemma, ORTH: orth}, # , TAG: "VERB"},
{LEMMA: "ce", ORTH: "-ce"},
]
for pre, pre_lemma in [("qu'", "que"), ("n'", "ne")]: for pre in ["qu'", "n'"]:
for orth in [pre, pre.title()]: for orth in [pre, pre.title()]:
_exc[f"{orth}est-ce"] = [ _exc[f"{orth}est-ce"] = [{ORTH: orth}, {ORTH: "est"}, {ORTH: "-ce"}]
{LEMMA: pre_lemma, ORTH: orth},
{LEMMA: "être", ORTH: "est"},
{LEMMA: "ce", ORTH: "-ce"},
]
for verb, pronoun in [("est", "il"), ("EST", "IL")]: for verb, pronoun in [("est", "il"), ("EST", "IL")]:
token = "{}-{}".format(verb, pronoun) _exc[f"{verb}-{pronoun}"] = [{ORTH: verb}, {ORTH: "-" + pronoun}]
_exc[token] = [
{LEMMA: "être", ORTH: verb},
{LEMMA: pronoun, ORTH: "-" + pronoun},
]
for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]: for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
token = "{}'{}-{}".format(s, verb, pronoun) _exc[f"{s}'{verb}-{pronoun}"] = [
_exc[token] = [ {ORTH: s + "'"},
{LEMMA: "se", ORTH: s + "'"}, {ORTH: verb},
{LEMMA: "être", ORTH: verb}, {ORTH: "-" + pronoun},
{LEMMA: pronoun, ORTH: "-" + pronoun},
] ]
@ -452,7 +437,7 @@ _regular_exp += [
] ]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile( TOKEN_MATCH = re.compile(
"(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp) "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
).match ).match
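
With LEMMA gone from the exception dicts, each language now merges BASE_EXCEPTIONS into its own table with update_exc inside the module itself. A rough sketch of what such a merge does (string keys stand in for spaCy's ORTH symbol; the real helper lives in spacy.util):

def merge_exceptions(base: dict, *addons: dict) -> dict:
    # simplified update_exc-style merge: later tables win, and each entry's
    # ORTH pieces must reproduce the original string exactly
    exc = dict(base)
    for addon in addons:
        for orth, tokens in addon.items():
            joined = "".join(t["ORTH"] for t in tokens)
            if joined != orth:
                raise ValueError(f"pieces {joined!r} do not match {orth!r}")
        exc.update(addon)
    return exc

base = {":)": [{"ORTH": ":)"}]}
french = {"J.-C.": [{"ORTH": "J."}, {"ORTH": "-C."}]}
assert set(merge_exceptions(base, french)) == {":)", "J.-C."}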


@ -1,33 +1,16 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "ga"
stop_words = {"@language_data": "spacy.ga.stop_words"}
"""
@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class IrishDefaults(Language.Defaults): class IrishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Irish(Language): class Irish(Language):
lang = "ga" lang = "ga"
Defaults = IrishDefaults Defaults = IrishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Irish"] __all__ = ["Irish"]


@ -1,79 +1,65 @@
from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = { _exc = {
"'acha'n": [ "'acha'n": [{ORTH: "'ach", NORM: "gach"}, {ORTH: "a'n", NORM: "aon"}],
{ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, "dem'": [{ORTH: "de", NORM: "de"}, {ORTH: "m'", NORM: "mo"}],
{ORTH: "a'n", LEMMA: "aon", NORM: "aon", POS: DET}, "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
], "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
"dem'": [ "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
{ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
{ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
],
"ded'": [
{ORTH: "de", LEMMA: "de", NORM: "de", POS: ADP},
{ORTH: "d'", LEMMA: "do", NORM: "do", POS: DET},
],
"lem'": [
{ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
{ORTH: "m'", LEMMA: "mo", NORM: "mo", POS: DET},
],
"led'": [
{ORTH: "le", LEMMA: "le", NORM: "le", POS: ADP},
{ORTH: "d'", LEMMA: "mo", NORM: "do", POS: DET},
],
} }
for exc_data in [ for exc_data in [
{ORTH: "'gus", LEMMA: "agus", NORM: "agus", POS: CCONJ}, {ORTH: "'gus", NORM: "agus"},
{ORTH: "'ach", LEMMA: "gach", NORM: "gach", POS: DET}, {ORTH: "'ach", NORM: "gach"},
{ORTH: "ao'", LEMMA: "aon", NORM: "aon"}, {ORTH: "ao'", NORM: "aon"},
{ORTH: "'niar", LEMMA: "aniar", NORM: "aniar", POS: ADV}, {ORTH: "'niar", NORM: "aniar"},
{ORTH: "'níos", LEMMA: "aníos", NORM: "aníos", POS: ADV}, {ORTH: "'níos", NORM: "aníos"},
{ORTH: "'ndiu", LEMMA: "inniu", NORM: "inniu", POS: ADV}, {ORTH: "'ndiu", NORM: "inniu"},
{ORTH: "'nocht", LEMMA: "anocht", NORM: "anocht", POS: ADV}, {ORTH: "'nocht", NORM: "anocht"},
{ORTH: "m'", LEMMA: "mo", POS: DET}, {ORTH: "m'"},
{ORTH: "Aib.", LEMMA: "Aibreán", POS: NOUN}, {ORTH: "Aib."},
{ORTH: "Ath.", LEMMA: "athair", POS: NOUN}, {ORTH: "Ath."},
{ORTH: "Beal.", LEMMA: "Bealtaine", POS: NOUN}, {ORTH: "Beal."},
{ORTH: "a.C.n.", LEMMA: "ante Christum natum", POS: X}, {ORTH: "a.C.n."},
{ORTH: "m.sh.", LEMMA: "mar shampla", POS: ADV}, {ORTH: "m.sh."},
{ORTH: "M.F.", LEMMA: "Meán Fómhair", POS: NOUN}, {ORTH: "M.F."},
{ORTH: "M.Fómh.", LEMMA: "Meán Fómhair", POS: NOUN}, {ORTH: "M.Fómh."},
{ORTH: "D.F.", LEMMA: "Deireadh Fómhair", POS: NOUN}, {ORTH: "D.F."},
{ORTH: "D.Fómh.", LEMMA: "Deireadh Fómhair", POS: NOUN}, {ORTH: "D.Fómh."},
{ORTH: "r.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.C."},
{ORTH: "R.C.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.C."},
{ORTH: "r.Ch.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Ch."},
{ORTH: "r.Chr.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "r.Chr."},
{ORTH: "R.Ch.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.Ch."},
{ORTH: "R.Chr.", LEMMA: "roimh Chríost", POS: ADV}, {ORTH: "R.Chr."},
{ORTH: "⁊rl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "⁊rl."},
{ORTH: "srl.", LEMMA: "agus araile", POS: ADV}, {ORTH: "srl."},
{ORTH: "Co.", LEMMA: "contae", POS: NOUN}, {ORTH: "Co."},
{ORTH: "Ean.", LEMMA: "Eanáir", POS: NOUN}, {ORTH: "Ean."},
{ORTH: "Feab.", LEMMA: "Feabhra", POS: NOUN}, {ORTH: "Feab."},
{ORTH: "gCo.", LEMMA: "contae", POS: NOUN}, {ORTH: "gCo."},
{ORTH: ".i.", LEMMA: "eadhon", POS: ADV}, {ORTH: ".i."},
{ORTH: "B'", LEMMA: "ba", POS: AUX}, {ORTH: "B'"},
{ORTH: "b'", LEMMA: "ba", POS: AUX}, {ORTH: "b'"},
{ORTH: "lch.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "lch."},
{ORTH: "Lch.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "Lch."},
{ORTH: "lgh.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "lgh."},
{ORTH: "Lgh.", LEMMA: "leathanach", POS: NOUN}, {ORTH: "Lgh."},
{ORTH: "Lún.", LEMMA: "Lúnasa", POS: NOUN}, {ORTH: "Lún."},
{ORTH: "Már.", LEMMA: "Márta", POS: NOUN}, {ORTH: "Már."},
{ORTH: "Meith.", LEMMA: "Meitheamh", POS: NOUN}, {ORTH: "Meith."},
{ORTH: "Noll.", LEMMA: "Nollaig", POS: NOUN}, {ORTH: "Noll."},
{ORTH: "Samh.", LEMMA: "Samhain", POS: NOUN}, {ORTH: "Samh."},
{ORTH: "tAth.", LEMMA: "athair", POS: NOUN}, {ORTH: "tAth."},
{ORTH: "tUas.", LEMMA: "Uasal", POS: NOUN}, {ORTH: "tUas."},
{ORTH: "teo.", LEMMA: "teoranta", POS: NOUN}, {ORTH: "teo."},
{ORTH: "Teo.", LEMMA: "teoranta", POS: NOUN}, {ORTH: "Teo."},
{ORTH: "Uas.", LEMMA: "Uasal", POS: NOUN}, {ORTH: "Uas."},
{ORTH: "uimh.", LEMMA: "uimhir", POS: NOUN}, {ORTH: "uimh."},
{ORTH: "Uimh.", LEMMA: "uimhir", POS: NOUN}, {ORTH: "Uimh."},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -81,4 +67,4 @@ for orth in ["d'", "D'"]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class GujaratiDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "gu"
stop_words = {"@language_data": "spacy.gu.stop_words"}
"""
@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Gujarati(Language): class Gujarati(Language):
lang = "gu" lang = "gu"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = GujaratiDefaults
__all__ = ["Gujarati"] __all__ = ["Gujarati"]


@ -1,37 +1,15 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "he"
stop_words = {"@language_data": "spacy.he.stop_words"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HebrewDefaults(Language.Defaults): class HebrewDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Hebrew(Language): class Hebrew(Language):
lang = "he" lang = "he"
Defaults = HebrewDefaults Defaults = HebrewDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hebrew"] __all__ = ["Hebrew"]


@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class HindiDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "hi" lex_attr_getters = LEX_ATTRS
stop_words = {"@language_data": "spacy.hi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
"""
@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Hindi(Language): class Hindi(Language):
lang = "hi" lang = "hi"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = HindiDefaults
__all__ = ["Hindi"] __all__ = ["Hindi"]


@ -1,39 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "hr"
stop_words = {"@language_data": "spacy.hr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class CroatianDefaults(Language.Defaults): class CroatianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) stop_words = STOP_WORDS
class Croatian(Language): class Croatian(Language):
lang = "hr" lang = "hr"
Defaults = CroatianDefaults Defaults = CroatianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Croatian"] __all__ = ["Croatian"]


@ -1,45 +1,21 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HungarianDefaults(Language.Defaults): class HungarianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH token_match = TOKEN_MATCH
stop_words = STOP_WORDS
class Hungarian(Language): class Hungarian(Language):
lang = "hu" lang = "hu"
Defaults = HungarianDefaults Defaults = HungarianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hungarian"] __all__ = ["Hungarian"]


@ -1,7 +1,9 @@
import re import re
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import ALPHA_LOWER, CURRENCY from ..punctuation import ALPHA_LOWER, CURRENCY
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
@ -644,5 +646,5 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
) )
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match


@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class ArmenianDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "hy" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""
@registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Armenian(Language): class Armenian(Language):
lang = "hy" lang = "hy"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = ArmenianDefaults
__all__ = ["Armenian"] __all__ = ["Armenian"]


@ -1,53 +1,24 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class IndonesianDefaults(Language.Defaults): class IndonesianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Indonesian(Language): class Indonesian(Language):
lang = "id" lang = "id"
Defaults = IndonesianDefaults Defaults = IndonesianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Indonesian"] __all__ = ["Indonesian"]


@ -1,26 +1,20 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
labels = [ # fmt: off
"nsubj", labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
"nsubj:pass", # fmt: on
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")


@ -1,5 +1,8 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ...symbols import ORTH, LEMMA, NORM from ...symbols import ORTH, NORM
from ...util import update_exc
# Daftar singkatan dan Akronim dari: # Daftar singkatan dan Akronim dari:
# https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A # https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
@ -8,53 +11,47 @@ _exc = {}
for orth in ID_BASE_EXCEPTIONS: for orth in ID_BASE_EXCEPTIONS:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
orth_title = orth.title() orth_title = orth.title()
_exc[orth_title] = [{ORTH: orth_title}] _exc[orth_title] = [{ORTH: orth_title}]
orth_caps = orth.upper() orth_caps = orth.upper()
_exc[orth_caps] = [{ORTH: orth_caps}] _exc[orth_caps] = [{ORTH: orth_caps}]
orth_lower = orth.lower() orth_lower = orth.lower()
_exc[orth_lower] = [{ORTH: orth_lower}] _exc[orth_lower] = [{ORTH: orth_lower}]
orth_first_upper = orth[0].upper() + orth[1:] orth_first_upper = orth[0].upper() + orth[1:]
_exc[orth_first_upper] = [{ORTH: orth_first_upper}] _exc[orth_first_upper] = [{ORTH: orth_first_upper}]
if "-" in orth: if "-" in orth:
orth_title = "-".join([part.title() for part in orth.split("-")]) orth_title = "-".join([part.title() for part in orth.split("-")])
_exc[orth_title] = [{ORTH: orth_title}] _exc[orth_title] = [{ORTH: orth_title}]
orth_caps = "-".join([part.upper() for part in orth.split("-")]) orth_caps = "-".join([part.upper() for part in orth.split("-")])
_exc[orth_caps] = [{ORTH: orth_caps}] _exc[orth_caps] = [{ORTH: orth_caps}]
for exc_data in [ for exc_data in [
{ORTH: "Jan.", LEMMA: "Januari", NORM: "Januari"}, {ORTH: "Jan.", NORM: "Januari"},
{ORTH: "Feb.", LEMMA: "Februari", NORM: "Februari"}, {ORTH: "Feb.", NORM: "Februari"},
{ORTH: "Mar.", LEMMA: "Maret", NORM: "Maret"}, {ORTH: "Mar.", NORM: "Maret"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"}, {ORTH: "Jun.", NORM: "Juni"},
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"}, {ORTH: "Jul.", NORM: "Juli"},
{ORTH: "Agu.", LEMMA: "Agustus", NORM: "Agustus"}, {ORTH: "Agu.", NORM: "Agustus"},
{ORTH: "Ags.", LEMMA: "Agustus", NORM: "Agustus"}, {ORTH: "Ags.", NORM: "Agustus"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"}, {ORTH: "Okt.", NORM: "Oktober"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Des.", LEMMA: "Desember", NORM: "Desember"}, {ORTH: "Des.", NORM: "Desember"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
_other_exc = { _other_exc = {
"do'a": [{ORTH: "do'a", LEMMA: "doa", NORM: "doa"}], "do'a": [{ORTH: "do'a", NORM: "doa"}],
"jum'at": [{ORTH: "jum'at", LEMMA: "Jumat", NORM: "Jumat"}], "jum'at": [{ORTH: "jum'at", NORM: "Jumat"}],
"Jum'at": [{ORTH: "Jum'at", LEMMA: "Jumat", NORM: "Jumat"}], "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}],
"la'nat": [{ORTH: "la'nat", LEMMA: "laknat", NORM: "laknat"}], "la'nat": [{ORTH: "la'nat", NORM: "laknat"}],
"ma'af": [{ORTH: "ma'af", LEMMA: "maaf", NORM: "maaf"}], "ma'af": [{ORTH: "ma'af", NORM: "maaf"}],
"mu'jizat": [{ORTH: "mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}], "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}],
"Mu'jizat": [{ORTH: "Mu'jizat", LEMMA: "mukjizat", NORM: "mukjizat"}], "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}],
"ni'mat": [{ORTH: "ni'mat", LEMMA: "nikmat", NORM: "nikmat"}], "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}],
"raka'at": [{ORTH: "raka'at", LEMMA: "rakaat", NORM: "rakaat"}], "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}],
"ta'at": [{ORTH: "ta'at", LEMMA: "taat", NORM: "taat"}], "ta'at": [{ORTH: "ta'at", NORM: "taat"}],
} }
_exc.update(_other_exc) _exc.update(_other_exc)
@ -221,4 +218,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
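
The loop above expands every Indonesian base exception into its casing variants. A compact sketch of that expansion; the sample entries are hypothetical, the real input is ID_BASE_EXCEPTIONS:

def case_variants(orth: str) -> set:
    variants = {orth, orth.title(), orth.upper(), orth.lower(), orth[0].upper() + orth[1:]}
    if "-" in orth:
        variants.add("-".join(part.title() for part in orth.split("-")))
        variants.add("-".join(part.upper() for part in orth.split("-")))
    return variants

_exc = {}
for orth in ["dll.", "s-d"]:  # hypothetical sample entries
    for variant in case_variants(orth):
        _exc[variant] = [{"ORTH": variant}]
print(sorted(_exc))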


@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class IcelandicDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "is"
stop_words = {"@language_data": "spacy.is.stop_words"}
"""
@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Icelandic(Language): class Icelandic(Language):
lang = "is" lang = "is"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = IcelandicDefaults
__all__ = ["Icelandic"] __all__ = ["Icelandic"]


@ -1,35 +1,11 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "it"
stop_words = {"@language_data": "spacy.it.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class ItalianDefaults(Language.Defaults): class ItalianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
@ -38,7 +14,6 @@ class ItalianDefaults(Language.Defaults):
class Italian(Language): class Italian(Language):
lang = "it" lang = "it"
Defaults = ItalianDefaults Defaults = ItalianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Italian"] __all__ = ["Italian"]


@ -1,4 +1,7 @@
from ...symbols import ORTH, LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
_exc = { _exc = {
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}], "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
@ -7,7 +10,7 @@ _exc = {
"L'art.": [{ORTH: "L'"}, {ORTH: "art."}], "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
"l'art.": [{ORTH: "l'"}, {ORTH: "art."}], "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
"nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}], "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
"po'": [{ORTH: "po'", LEMMA: "poco"}], "po'": [{ORTH: "po'"}],
"sett..": [{ORTH: "sett."}, {ORTH: "."}], "sett..": [{ORTH: "sett."}, {ORTH: "."}],
} }
@ -52,4 +55,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -1,4 +1,4 @@
from typing import Optional, Union, Dict, Any, Set from typing import Optional, Union, Dict, Any
from pathlib import Path from pathlib import Path
import srsly import srsly
from collections import namedtuple from collections import namedtuple
@ -20,27 +20,15 @@ from ... import util
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1" @tokenizers = "spacy.ja.JapaneseTokenizer"
split_mode = null split_mode = null
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
""" """
@registry.language_data("spacy.ja.stop_words") @registry.tokenizers("spacy.ja.JapaneseTokenizer")
def stop_words() -> Set[str]: def create_tokenizer(split_mode: Optional[str] = None):
return STOP_WORDS
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp): def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode) return JapaneseTokenizer(nlp, split_mode=split_mode)
@ -50,6 +38,8 @@ def create_japanese_tokenizer(split_mode: Optional[str] = None):
class JapaneseTokenizer(DummyTokenizer): class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
self.split_mode = split_mode self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode) self.tokenizer = try_sudachi_import(self.split_mode)
@ -172,14 +162,15 @@ class JapaneseTokenizer(DummyTokenizer):
class JapaneseDefaults(Language.Defaults): class JapaneseDefaults(Language.Defaults):
tag_map = TAG_MAP config = Config().from_str(DEFAULT_CONFIG)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Japanese(Language): class Japanese(Language):
lang = "ja" lang = "ja"
Defaults = JapaneseDefaults Defaults = JapaneseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
# Hold the attributes we need with convenient names # Hold the attributes we need with convenient names
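
The Japanese defaults now carry their tokenizer settings in a config string rather than constructor arguments. A small sketch of how such a block parses, assuming thinc v8's Config; resolving @tokenizers to the registered factory happens later, when the pipeline is built:

from thinc.api import Config

sample = """
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.ja.JapaneseTokenizer"
split_mode = null
"""

config = Config().from_str(sample)
print(config["nlp"]["tokenizer"]["@tokenizers"])  # spacy.ja.JapaneseTokenizer
print(config["nlp"]["tokenizer"]["split_mode"])   # None: null parses to None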


@ -1,33 +1,23 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON, VERB from ...symbols import NOUN, PROPN, PRON, VERB
from ...tokens import Doc, Span
# XXX this can probably be pruned a bit
labels = [
"nsubj",
"nmod",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"obj",
"obl",
"dative",
"appos",
"attr",
"ROOT",
]
def noun_chunks(obj): # TODO: this can probably be pruned a bit
""" # fmt: off
Detect base noun phrases from a dependency parse. Works on both Doc and Span. labels = ["nsubj", "nmod", "dobj", "nsubjpass", "pcomp", "pobj", "obj", "obl", "dative", "appos", "attr", "ROOT"]
""" # fmt: on
doc = obj.doc # Ensure works on both Doc and Span.
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings.add(label) for label in labels] np_deps = [doc.vocab.strings.add(label) for label in labels]
doc.vocab.strings.add("conj") doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced
@ -37,12 +27,10 @@ def noun_chunks(obj):
unseen = [w.i for w in word.subtree if w.i not in seen] unseen = [w.i for w in word.subtree if w.i not in seen]
if not unseen: if not unseen:
continue continue
# this takes care of particles etc. # this takes care of particles etc.
seen.update(j.i for j in word.subtree) seen.update(j.i for j in word.subtree)
# This avoids duplicating embedded clauses # This avoids duplicating embedded clauses
seen.update(range(word.i + 1)) seen.update(range(word.i + 1))
# if the head of this is a verb, mark that and rights seen # if the head of this is a verb, mark that and rights seen
# Don't do the subtree as that can hide other phrases # Don't do the subtree as that can hide other phrases
if word.head.pos == VERB: if word.head.pos == VERB:


@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class KannadaDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "kn"
stop_words = {"@language_data": "spacy.kn.stop_words"}
"""
@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Kannada(Language): class Kannada(Language):
lang = "kn" lang = "kn"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = KannadaDefaults
__all__ = ["Kannada"] __all__ = ["Kannada"]


@ -1,8 +1,9 @@
from typing import Set, Optional, Any, Dict from typing import Optional, Any, Dict
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...tokens import Doc from ...tokens import Doc
from ...compat import copy_reg from ...compat import copy_reg
@ -11,26 +12,14 @@ from ...util import DummyTokenizer, registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1" @tokenizers = "spacy.ko.KoreanTokenizer"
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
""" """
@registry.language_data("spacy.ko.stop_words") @registry.tokenizers("spacy.ko.KoreanTokenizer")
def stop_words() -> Set[str]: def create_tokenizer():
return STOP_WORDS
@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
def korean_tokenizer_factory(nlp): def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp) return KoreanTokenizer(nlp)
@ -40,6 +29,8 @@ def create_korean_tokenizer():
class KoreanTokenizer(DummyTokenizer): class KoreanTokenizer(DummyTokenizer):
def __init__(self, nlp: Optional[Language] = None): def __init__(self, nlp: Optional[Language] = None):
self.vocab = nlp.vocab self.vocab = nlp.vocab
# TODO: is this the right way to do it?
self.vocab.morphology.load_tag_map(TAG_MAP)
MeCab = try_mecab_import() MeCab = try_mecab_import()
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
@ -73,13 +64,15 @@ class KoreanTokenizer(DummyTokenizer):
class KoreanDefaults(Language.Defaults): class KoreanDefaults(Language.Defaults):
tag_map = TAG_MAP config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
class Korean(Language): class Korean(Language):
lang = "ko" lang = "ko"
Defaults = KoreanDefaults Defaults = KoreanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
def try_mecab_import() -> None: def try_mecab_import() -> None:


@ -1,49 +1,20 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "lb"
stop_words = {"@language_data": "spacy.lb.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LuxembourgishDefaults(Language.Defaults): class LuxembourgishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Luxembourgish(Language): class Luxembourgish(Language):
lang = "lb" lang = "lb"
Defaults = LuxembourgishDefaults Defaults = LuxembourgishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Luxembourgish"] __all__ = ["Luxembourgish"]


@ -1,4 +1,7 @@
from ...symbols import ORTH, LEMMA, NORM from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
# TODO # TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions) # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
@ -7,19 +10,19 @@ _exc = {}
# translate / delete what is not necessary # translate / delete what is not necessary
for exc_data in [ for exc_data in [
{ORTH: "t", LEMMA: "et", NORM: "et"}, {ORTH: "t", NORM: "et"},
{ORTH: "T", LEMMA: "et", NORM: "et"}, {ORTH: "T", NORM: "et"},
{ORTH: "'t", LEMMA: "et", NORM: "et"}, {ORTH: "'t", NORM: "et"},
{ORTH: "'T", LEMMA: "et", NORM: "et"}, {ORTH: "'T", NORM: "et"},
{ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"}, {ORTH: "wgl.", NORM: "wannechgelift"},
{ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"}, {ORTH: "M.", NORM: "Monsieur"},
{ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"}, {ORTH: "Mme.", NORM: "Madame"},
{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"}, {ORTH: "Dr.", NORM: "Dokter"},
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"}, {ORTH: "Tel.", NORM: "Telefon"},
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"}, {ORTH: "asw.", NORM: "an sou weider"},
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}, {ORTH: "etc.", NORM: "et cetera"},
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"}, {ORTH: "bzw.", NORM: "bezéiungsweis"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}, {ORTH: "Jan.", NORM: "Januar"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -47,4 +50,4 @@ for orth in [
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
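
With LEMMA dropped, NORM is the only extra attribute these exceptions still set, and it surfaces on token.norm_. A hypothetical spot-check, assuming an installed spaCy build that ships these Luxembourgish defaults:

import spacy

nlp = spacy.blank("lb")
doc = nlp("wgl. kommt mat")
print([(t.text, t.norm_) for t in doc])
# expected first pair: ('wgl.', 'wannechgelift')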


@ -1,35 +1,18 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "lij"
stop_words = {"@language_data": "spacy.lij.stop_words"}
"""
@registry.language_data("spacy.lij.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class LigurianDefaults(Language.Defaults): class LigurianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Ligurian(Language): class Ligurian(Language):
lang = "lij" lang = "lij"
Defaults = LigurianDefaults Defaults = LigurianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Ligurian"] __all__ = ["Ligurian"]


@ -1,50 +1,50 @@
from ...symbols import ORTH, LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
for raw, lemma in [ for raw in [
("a-a", "a-o"), "a-e",
("a-e", "a-o"), "a-o",
("a-o", "a-o"), "a-i",
("a-i", "a-o"), "a-a",
("co-a", "co-o"), "co-a",
("co-e", "co-o"), "co-e",
("co-i", "co-o"), "co-i",
("co-o", "co-o"), "co-o",
("da-a", "da-o"), "da-a",
("da-e", "da-o"), "da-e",
("da-i", "da-o"), "da-i",
("da-o", "da-o"), "da-o",
("pe-a", "pe-o"), "pe-a",
("pe-e", "pe-o"), "pe-e",
("pe-i", "pe-o"), "pe-i",
("pe-o", "pe-o"), "pe-o",
]: ]:
for orth in [raw, raw.capitalize()]: for orth in [raw, raw.capitalize()]:
_exc[orth] = [{ORTH: orth, LEMMA: lemma}] _exc[orth] = [{ORTH: orth}]
# Prefix + prepositions with à (e.g. "sott'a-o") # Prefix + prepositions with à (e.g. "sott'a-o")
for prep, prep_lemma in [ for prep in [
("a-a", "a-o"), "a-a",
("a-e", "a-o"), "a-e",
("a-o", "a-o"), "a-o",
("a-i", "a-o"), "a-i",
]: ]:
for prefix, prefix_lemma in [ for prefix in [
("sott'", "sotta"), "sott'",
("sott", "sotta"), "sott",
("contr'", "contra"), "contr'",
("contr", "contra"), "contr",
("ch'", "che"), "ch'",
("ch", "che"), "ch",
("s'", "se"), "s'",
("s", "se"), "s",
]: ]:
for prefix_orth in [prefix, prefix.capitalize()]: for prefix_orth in [prefix, prefix.capitalize()]:
_exc[prefix_orth + prep] = [ _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
{ORTH: prefix_orth, LEMMA: prefix_lemma},
{ORTH: prep, LEMMA: prep_lemma},
]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -1,54 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "spacy.lt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.lt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LithuanianDefaults(Language.Defaults): class LithuanianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
mod_base_exceptions = { tokenizer_exceptions = TOKENIZER_EXCEPTIONS
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") stop_words = STOP_WORDS
} lex_attr_getters = LEX_ATTRS
del mod_base_exceptions["8)"]
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
class Lithuanian(Language): class Lithuanian(Language):
lang = "lt" lang = "lt"
Defaults = LithuanianDefaults Defaults = LithuanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Lithuanian"] __all__ = ["Lithuanian"]


@ -1,267 +1,15 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
for orth in [ for orth in ["n-tosios", "?!"]:
"n-tosios",
"?!",
# "G.",
# "J. E.",
# "J. Em.",
# "J.E.",
# "J.Em.",
# "K.",
# "N.",
# "V.",
# "Vt.",
# "a.",
# "a.k.",
# "a.s.",
# "adv.",
# "akad.",
# "aklg.",
# "akt.",
# "al.",
# "ang.",
# "angl.",
# "aps.",
# "apskr.",
# "apyg.",
# "arbat.",
# "asist.",
# "asm.",
# "asm.k.",
# "asmv.",
# "atk.",
# "atsak.",
# "atsisk.",
# "atsisk.sąsk.",
# "atv.",
# "aut.",
# "avd.",
# "b.k.",
# "baud.",
# "biol.",
# "bkl.",
# "bot.",
# "bt.",
# "buv.",
# "ch.",
# "chem.",
# "corp.",
# "d.",
# "dab.",
# "dail.",
# "dek.",
# "deš.",
# "dir.",
# "dirig.",
# "doc.",
# "dol.",
# "dr.",
# "drp.",
# "dvit.",
# "dėst.",
# "dš.",
# "dž.",
# "e.b.",
# "e.bankas",
# "e.p.",
# "e.parašas",
# "e.paštas",
# "e.v.",
# "e.valdžia",
# "egz.",
# "eil.",
# "ekon.",
# "el.",
# "el.bankas",
# "el.p.",
# "el.parašas",
# "el.paštas",
# "el.valdžia",
# "etc.",
# "ež.",
# "fak.",
# "faks.",
# "feat.",
# "filol.",
# "filos.",
# "g.",
# "gen.",
# "geol.",
# "gerb.",
# "gim.",
# "gr.",
# "gv.",
# "gyd.",
# "gyv.",
# "habil.",
# "inc.",
# "insp.",
# "inž.",
# "ir pan.",
# "ir t. t.",
# "isp.",
# "istor.",
# "it.",
# "just.",
# "k.",
# "k. a.",
# "k.a.",
# "kab.",
# "kand.",
# "kart.",
# "kat.",
# "ketv.",
# "kh.",
# "kl.",
# "kln.",
# "km.",
# "kn.",
# "koresp.",
# "kpt.",
# "kr.",
# "kt.",
# "kub.",
# "kun.",
# "kv.",
# "kyš.",
# "l. e. p.",
# "l.e.p.",
# "lenk.",
# "liet.",
# "lot.",
# "lt.",
# "ltd.",
# "ltn.",
# "m.",
# "m.e..",
# "m.m.",
# "mat.",
# "med.",
# "mgnt.",
# "mgr.",
# "min.",
# "mjr.",
# "ml.",
# "mln.",
# "mlrd.",
# "mob.",
# "mok.",
# "moksl.",
# "mokyt.",
# "mot.",
# "mr.",
# "mst.",
# "mstl.",
# "mėn.",
# "nkt.",
# "no.",
# "nr.",
# "ntk.",
# "nuotr.",
# "op.",
# "org.",
# "orig.",
# "p.",
# "p.d.",
# "p.m.e.",
# "p.s.",
# "pab.",
# "pan.",
# "past.",
# "pav.",
# "pavad.",
# "per.",
# "perd.",
# "pirm.",
# "pl.",
# "plg.",
# "plk.",
# "pr.",
# "pr.Kr.",
# "pranc.",
# "proc.",
# "prof.",
# "prom.",
# "prot.",
# "psl.",
# "pss.",
# "pvz.",
# "pšt.",
# "r.",
# "raj.",
# "red.",
# "rez.",
# "rež.",
# "rus.",
# "rš.",
# "s.",
# "sav.",
# "saviv.",
# "sek.",
# "sekr.",
# "sen.",
# "sh.",
# "sk.",
# "skg.",
# "skv.",
# "skyr.",
# "sp.",
# "spec.",
# "sr.",
# "st.",
# "str.",
# "stud.",
# "sąs.",
# "t.",
# "t. p.",
# "t. y.",
# "t.p.",
# "t.t.",
# "t.y.",
# "techn.",
# "tel.",
# "teol.",
# "th.",
# "tir.",
# "trit.",
# "trln.",
# "tšk.",
# "tūks.",
# "tūkst.",
# "up.",
# "upl.",
# "v.s.",
# "vad.",
# "val.",
# "valg.",
# "ved.",
# "vert.",
# "vet.",
# "vid.",
# "virš.",
# "vlsč.",
# "vnt.",
# "vok.",
# "vs.",
# "vtv.",
# "vv.",
# "vyr.",
# "vyresn.",
# "zool.",
# "Įn",
# "įl.",
# "š.m.",
# "šnek.",
# "šv.",
# "švč.",
# "ž.ū.",
# "žin.",
# "žml.",
# "žr.",
]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
del mod_base_exceptions["8)"]
TOKENIZER_EXCEPTIONS = update_exc(mod_base_exceptions, _exc)
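
The base-exception filtering that used to live in the Lithuanian Defaults now happens inside tokenizer_exceptions.py itself. A self-contained sketch of the same filtering; BASE_EXCEPTIONS below is a tiny hypothetical stand-in for the shared table:

BASE_EXCEPTIONS = {  # tiny hypothetical stand-in for the shared table
    "a.m.": [{"ORTH": "a.m."}],
    "8)": [{"ORTH": "8)"}],
    ":)": [{"ORTH": ":)"}],
}

mod_base_exceptions = {
    exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
del mod_base_exceptions["8)"]  # dropped in the real module as well

_exc = {"n-tosios": [{"ORTH": "n-tosios"}]}
TOKENIZER_EXCEPTIONS = {**mod_base_exceptions, **_exc}
assert set(TOKENIZER_EXCEPTIONS) == {":)", "n-tosios"}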


@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class LatvianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "lv"
stop_words = {"@language_data": "spacy.lv.stop_words"}
"""
@registry.language_data("spacy.lv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Latvian(Language): class Latvian(Language):
lang = "lv" lang = "lv"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = LatvianDefaults
__all__ = ["Latvian"] __all__ = ["Latvian"]


@ -1,26 +1,16 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class MalayalamDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "ml" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.ml.stop_words"}
"""
@registry.language_data("spacy.ml.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Malayalam(Language): class Malayalam(Language):
lang = "ml" lang = "ml"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = MalayalamDefaults
__all__ = ["Malayalam"] __all__ = ["Malayalam"]


@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class MarathiDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "af"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""
@registry.language_data("spacy.mr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Marathi(Language): class Marathi(Language):
lang = "mr" lang = "mr"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = MarathiDefaults
__all__ = ["Marathi"] __all__ = ["Marathi"]


@ -1,47 +1,23 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.nb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class NorwegianDefaults(Language.Defaults): class NorwegianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Norwegian(Language): class Norwegian(Language):
lang = "nb" lang = "nb"
Defaults = NorwegianDefaults Defaults = NorwegianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Norwegian"] __all__ = ["Norwegian"]


@ -1,26 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
labels = [ # fmt: on
"nsubj",
"nsubj:pass",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")


@ -1,21 +1,23 @@
from ...symbols import ORTH, LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
for exc_data in [ for exc_data in [
{ORTH: "jan.", LEMMA: "januar"}, {ORTH: "jan.", NORM: "januar"},
{ORTH: "feb.", LEMMA: "februar"}, {ORTH: "feb.", NORM: "februar"},
{ORTH: "mar.", LEMMA: "mars"}, {ORTH: "mar.", NORM: "mars"},
{ORTH: "apr.", LEMMA: "april"}, {ORTH: "apr.", NORM: "april"},
{ORTH: "jun.", LEMMA: "juni"}, {ORTH: "jun.", NORM: "juni"},
{ORTH: "jul.", LEMMA: "juli"}, {ORTH: "jul.", NORM: "juli"},
{ORTH: "aug.", LEMMA: "august"}, {ORTH: "aug.", NORM: "august"},
{ORTH: "sep.", LEMMA: "september"}, {ORTH: "sep.", NORM: "september"},
{ORTH: "okt.", LEMMA: "oktober"}, {ORTH: "okt.", NORM: "oktober"},
{ORTH: "nov.", LEMMA: "november"}, {ORTH: "nov.", NORM: "november"},
{ORTH: "des.", LEMMA: "desember"}, {ORTH: "des.", NORM: "desember"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -218,4 +220,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class NepaliDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "ne" lex_attr_getters = LEX_ATTRS
stop_words = {"@language_data": "spacy.ne.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
"""
@registry.language_data("spacy.ne.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ne.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Nepali(Language): class Nepali(Language):
lang = "ne" lang = "ne"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = NepaliDefaults
__all__ = ["Nepali"] __all__ = ["Nepali"]


@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.


@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ..norm_exceptions import BASE_NORMS from ..norm_exceptions import BASE_NORMS
from ...attrs import NORM, LIKE_NUM from ...attrs import NORM, LIKE_NUM


@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -7,52 +7,43 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import update_exc, registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1" @lemmatizers = "spacy.nl.DutchLemmatizer"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
""" """
@registry.language_data("spacy.nl.stop_words") @registry.lemmatizers("spacy.nl.DutchLemmatizer")
def stop_words() -> Set[str]: def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]:
return STOP_WORDS tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
def lemmatizer_factory(nlp: Language) -> DutchLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return DutchLemmatizer(lookups=lookups)
@registry.language_data("spacy.nl.lex_attr_getters") return lemmatizer_factory
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
return DutchLemmatizer(data_paths=data_paths)
class DutchDefaults(Language.Defaults): class DutchDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Dutch(Language): class Dutch(Language):
lang = "nl" lang = "nl"
Defaults = DutchDefaults Defaults = DutchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Dutch"] __all__ = ["Dutch"]

View File

@ -1,4 +1,7 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
# Extensive list of both common and uncommon dutch abbreviations copied from # Extensive list of both common and uncommon dutch abbreviations copied from
# github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based # github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
@ -1602,4 +1605,4 @@ for orth in abbrevs:
_exc[i] = [{ORTH: i}] _exc[i] = [{ORTH: i}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
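The merge with the shared base exceptions now happens inside the language module via update_exc instead of in Defaults. A toy illustration of what update_exc does (update_exc and the ORTH symbol are existing spaCy API; the entries are made up):

from spacy.util import update_exc
from spacy.symbols import ORTH

base = {":)": [{ORTH: ":)"}]}
dutch = {"z.g.": [{ORTH: "z.g."}]}
merged = update_exc(base, dutch)
assert ":)" in merged and "z.g." in merged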

View File

@ -1,4 +1,4 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@ -7,54 +7,53 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...lookups import load_lookups
from ...language import Language from ...language import Language
from ...util import registry from ...util import registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1" @lemmatizers = "spacy.pl.PolishLemmatizer"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
""" """
TOKENIZER_EXCEPTIONS = {
@registry.language_data("spacy.pl.stop_words") exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
def stop_words() -> Set[str]: }
return STOP_WORDS
@registry.language_data("spacy.pl.lex_attr_getters") @registry.lemmatizers("spacy.pl.PolishLemmatizer")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]: def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]:
return LEX_ATTRS # fmt: off
tables = [
"lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
"lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
"lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
]
# fmt: on
def lemmatizer_factory(nlp: Language) -> PolishLemmatizer:
lookups = load_lookups(lang=nlp.lang, tables=tables)
return PolishLemmatizer(lookups=lookups)
@registry.lemmatizers("spacy.PolishLemmatizer.v1") return lemmatizer_factory
def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
return PolishLemmatizer(data_paths=data_paths)
class PolishDefaults(Language.Defaults): class PolishDefaults(Language.Defaults):
mod_base_exceptions = { config = Config().from_str(DEFAULT_CONFIG)
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") tokenizer_exceptions = TOKENIZER_EXCEPTIONS
}
tokenizer_exceptions = mod_base_exceptions
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Polish(Language): class Polish(Language):
lang = "pl" lang = "pl"
Defaults = PolishDefaults Defaults = PolishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Polish"] __all__ = ["Polish"]

View File

@ -1,50 +1,21 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class PortugueseDefaults(Language.Defaults): class PortugueseDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Portuguese(Language): class Portuguese(Language):
lang = "pt" lang = "pt"
Defaults = PortugueseDefaults Defaults = PortugueseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Portuguese"] __all__ = ["Portuguese"]

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
_exc = {} _exc = {}
@ -50,4 +52,4 @@ for orth in [
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -3,7 +3,7 @@ from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
_prefixes = ( TOKENIZER_PREFIXES = (
["§", "%", "=", "", "", r"\+(?![0-9])"] ["§", "%", "=", "", "", r"\+(?![0-9])"]
+ LIST_PUNCT + LIST_PUNCT
+ LIST_ELLIPSES + LIST_ELLIPSES
@ -13,7 +13,7 @@ _prefixes = (
) )
_suffixes = ( TOKENIZER_SUFFIXES = (
LIST_PUNCT LIST_PUNCT
+ LIST_ELLIPSES + LIST_ELLIPSES
+ LIST_QUOTES + LIST_QUOTES
@ -31,7 +31,7 @@ _suffixes = (
] ]
) )
_infixes = ( TOKENIZER_INFIXES = (
LIST_ELLIPSES LIST_ELLIPSES
+ LIST_ICONS + LIST_ICONS
+ [ + [
@ -44,7 +44,3 @@ _infixes = (
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
] ]
) )
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

View File

@ -1,49 +1,27 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
# Lemma data note: # Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/ # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț) # Replaced characters using cedillas with the correct ones (ș and ț)
DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class RomanianDefaults(Language.Defaults): class RomanianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Romanian(Language): class Romanian(Language):
lang = "ro" lang = "ro"
Defaults = RomanianDefaults Defaults = RomanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Romanian"] __all__ = ["Romanian"]

View File

@ -1,4 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH from ...symbols import ORTH
from ...util import update_exc
from .punctuation import _make_ro_variants from .punctuation import _make_ro_variants
@ -91,4 +93,4 @@ for orth in [
_exc[variant] = [{ORTH: variant}] _exc[variant] = [{ORTH: variant}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,49 +1,40 @@
from typing import Set, Dict, Callable, Any from typing import Callable
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer from .lemmatizer import RussianLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...util import registry
from ...util import update_exc, registry
from ...language import Language from ...language import Language
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
[nlp.lemmatizer] [nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1" @lemmatizers = "spacy.ru.RussianLemmatizer"
""" """
@registry.language_data("spacy.ru.stop_words") @registry.lemmatizers("spacy.ru.RussianLemmatizer")
def stop_words() -> Set[str]: def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]:
return STOP_WORDS def lemmatizer_factory(nlp: Language) -> RussianLemmatizer:
@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
return RussianLemmatizer() return RussianLemmatizer()
return lemmatizer_factory
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) config = Config().from_str(DEFAULT_CONFIG)
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Russian(Language): class Russian(Language):
lang = "ru" lang = "ru"
Defaults = RussianDefaults Defaults = RussianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Russian"] __all__ = ["Russian"]

View File

@ -1,66 +1,66 @@
from ...symbols import ORTH, LEMMA, NORM from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
_abbrev_exc = [ _abbrev_exc = [
# Weekdays abbreviations # Weekdays abbreviations
{ORTH: "пн", LEMMA: "понедельник", NORM: "понедельник"}, {ORTH: "пн", NORM: "понедельник"},
{ORTH: "вт", LEMMA: "вторник", NORM: "вторник"}, {ORTH: "вт", NORM: "вторник"},
{ORTH: "ср", LEMMA: "среда", NORM: "среда"}, {ORTH: "ср", NORM: "среда"},
{ORTH: "чт", LEMMA: "четверг", NORM: "четверг"}, {ORTH: "чт", NORM: "четверг"},
{ORTH: "чтв", LEMMA: "четверг", NORM: "четверг"}, {ORTH: "чтв", NORM: "четверг"},
{ORTH: "пт", LEMMA: "пятница", NORM: "пятница"}, {ORTH: "пт", NORM: "пятница"},
{ORTH: "сб", LEMMA: "суббота", NORM: "суббота"}, {ORTH: "сб", NORM: "суббота"},
{ORTH: "сбт", LEMMA: "суббота", NORM: "суббота"}, {ORTH: "сбт", NORM: "суббота"},
{ORTH: "вс", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "вс", NORM: "воскресенье"},
{ORTH: "вскр", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "вскр", NORM: "воскресенье"},
{ORTH: "воскр", LEMMA: "воскресенье", NORM: "воскресенье"}, {ORTH: "воскр", NORM: "воскресенье"},
# Months abbreviations # Months abbreviations
{ORTH: "янв", LEMMA: "январь", NORM: "январь"}, {ORTH: "янв", NORM: "январь"},
{ORTH: "фев", LEMMA: "февраль", NORM: "февраль"}, {ORTH: "фев", NORM: "февраль"},
{ORTH: "февр", LEMMA: "февраль", NORM: "февраль"}, {ORTH: "февр", NORM: "февраль"},
{ORTH: "мар", LEMMA: "март", NORM: "март"}, {ORTH: "мар", NORM: "март"},
# {ORTH: "март", LEMMA: "март", NORM: "март"}, # {ORTH: "март", NORM: "март"},
{ORTH: "мрт", LEMMA: "март", NORM: "март"}, {ORTH: "мрт", NORM: "март"},
{ORTH: "апр", LEMMA: "апрель", NORM: "апрель"}, {ORTH: "апр", NORM: "апрель"},
# {ORTH: "май", LEMMA: "май", NORM: "май"}, # {ORTH: "май", NORM: "май"},
{ORTH: "июн", LEMMA: "июнь", NORM: "июнь"}, {ORTH: "июн", NORM: "июнь"},
# {ORTH: "июнь", LEMMA: "июнь", NORM: "июнь"}, # {ORTH: "июнь", NORM: "июнь"},
{ORTH: "июл", LEMMA: "июль", NORM: "июль"}, {ORTH: "июл", NORM: "июль"},
# {ORTH: "июль", LEMMA: "июль", NORM: "июль"}, # {ORTH: "июль", NORM: "июль"},
{ORTH: "авг", LEMMA: "август", NORM: "август"}, {ORTH: "авг", NORM: "август"},
{ORTH: "сен", LEMMA: "сентябрь", NORM: "сентябрь"}, {ORTH: "сен", NORM: "сентябрь"},
{ORTH: "сент", LEMMA: "сентябрь", NORM: "сентябрь"}, {ORTH: "сент", NORM: "сентябрь"},
{ORTH: "окт", LEMMA: "октябрь", NORM: "октябрь"}, {ORTH: "окт", NORM: "октябрь"},
{ORTH: "октб", LEMMA: "октябрь", NORM: "октябрь"}, {ORTH: "октб", NORM: "октябрь"},
{ORTH: "ноя", LEMMA: "ноябрь", NORM: "ноябрь"}, {ORTH: "ноя", NORM: "ноябрь"},
{ORTH: "нояб", LEMMA: "ноябрь", NORM: "ноябрь"}, {ORTH: "нояб", NORM: "ноябрь"},
{ORTH: "нбр", LEMMA: "ноябрь", NORM: "ноябрь"}, {ORTH: "нбр", NORM: "ноябрь"},
{ORTH: "дек", LEMMA: "декабрь", NORM: "декабрь"}, {ORTH: "дек", NORM: "декабрь"},
] ]
for abbrev_desc in _abbrev_exc: for abbrev_desc in _abbrev_exc:
abbrev = abbrev_desc[ORTH] abbrev = abbrev_desc[ORTH]
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}]
_exc[orth + "."] = [ _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
]
_slang_exc = [ _slang_exc = [
{ORTH: "2к15", LEMMA: "2015", NORM: "2015"}, {ORTH: "2к15", NORM: "2015"},
{ORTH: "2к16", LEMMA: "2016", NORM: "2016"}, {ORTH: "2к16", NORM: "2016"},
{ORTH: "2к17", LEMMA: "2017", NORM: "2017"}, {ORTH: "2к17", NORM: "2017"},
{ORTH: "2к18", LEMMA: "2018", NORM: "2018"}, {ORTH: "2к18", NORM: "2018"},
{ORTH: "2к19", LEMMA: "2019", NORM: "2019"}, {ORTH: "2к19", NORM: "2019"},
{ORTH: "2к20", LEMMA: "2020", NORM: "2020"}, {ORTH: "2к20", NORM: "2020"},
] ]
for slang_desc in _slang_exc: for slang_desc in _slang_exc:
_exc[slang_desc[ORTH]] = [slang_desc] _exc[slang_desc[ORTH]] = [slang_desc]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
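The exception entries now set only NORM; lemmas are left to the lemmatizer instead of the tokenizer. A quick, hedged check of the effect (assumes a blank Russian pipeline built from these defaults, with pymorphy2 installed for the lemmatizer the config requests):

import spacy

nlp = spacy.blank("ru")
doc = nlp("пн.")
print(doc[0].text, doc[0].norm_)  # пн. понедельник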

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class SinhalaDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "si" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.si.stop_words"}
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
"""
@registry.language_data("spacy.si.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.si.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Sinhala(Language): class Sinhala(Language):
lang = "si" lang = "si"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = SinhalaDefaults
__all__ = ["Sinhala"] __all__ = ["Sinhala"]

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class SlovakDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "sk" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.sk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
"""
@registry.language_data("spacy.sk.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Slovak(Language): class Slovak(Language):
lang = "sk" lang = "sk"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = SlovakDefaults
__all__ = ["Slovak"] __all__ = ["Slovak"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class SlovenianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "sl"
stop_words = {"@language_data": "spacy.sl.stop_words"}
"""
@registry.language_data("spacy.sl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Slovenian(Language): class Slovenian(Language):
lang = "sl" lang = "sl"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = SlovenianDefaults
__all__ = ["Slovenian"] __all__ = ["Slovenian"]

View File

@ -1,26 +1,14 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class AlbanianDefaults(Language.Defaults):
[nlp] stop_words = STOP_WORDS
lang = "sq"
stop_words = {"@language_data": "spacy.sq.stop_words"}
"""
@registry.language_data("spacy.sq.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Albanian(Language): class Albanian(Language):
lang = "sq" lang = "sq"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = AlbanianDefaults
__all__ = ["Albanian"] __all__ = ["Albanian"]

View File

@ -1,47 +1,18 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language from ...language import Language
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.sr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.sr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SerbianDefaults(Language.Defaults): class SerbianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Serbian(Language): class Serbian(Language):
lang = "sr" lang = "sr"
Defaults = SerbianDefaults Defaults = SerbianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Serbian"] __all__ = ["Serbian"]

View File

@ -1,93 +1,93 @@
from ...symbols import ORTH, LEMMA, NORM from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = {} _exc = {}
_abbrev_exc = [ _abbrev_exc = [
# Weekdays abbreviations # Weekdays abbreviations
{ORTH: "пoн", LEMMA: "понедељак", NORM: "понедељак"}, {ORTH: "пoн", NORM: "понедељак"},
{ORTH: "уто", LEMMA: "уторак", NORM: "уторак"}, {ORTH: "уто", NORM: "уторак"},
{ORTH: "сре", LEMMA: "среда", NORM: "среда"}, {ORTH: "сре", NORM: "среда"},
{ORTH: "чет", LEMMA: "четвртак", NORM: "четвртак"}, {ORTH: "чет", NORM: "четвртак"},
{ORTH: "пет", LEMMA: "петак", NORM: "петак"}, {ORTH: "пет", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"}, {ORTH: "суб", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"}, {ORTH: "нед", NORM: "недеља"},
# Months abbreviations # Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"}, {ORTH: "јан", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"}, {ORTH: "феб", NORM: "фебруар"},
{ORTH: "мар", LEMMA: "март", NORM: "март"}, {ORTH: "мар", NORM: "март"},
{ORTH: "апр", LEMMA: "април", NORM: "април"}, {ORTH: "апр", NORM: "април"},
{ORTH: "јуни", LEMMA: "јун", NORM: "јун"}, {ORTH: "јуни", NORM: "јун"},
{ORTH: "јули", LEMMA: "јул", NORM: "јул"}, {ORTH: "јули", NORM: "јул"},
{ORTH: "авг", LEMMA: "август", NORM: "август"}, {ORTH: "авг", NORM: "август"},
{ORTH: "сеп", LEMMA: "септембар", NORM: "септембар"}, {ORTH: "сеп", NORM: "септембар"},
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"}, {ORTH: "септ", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"}, {ORTH: "окт", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"}, {ORTH: "нов", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}, {ORTH: "дец", NORM: "децембар"},
] ]
for abbrev_desc in _abbrev_exc: for abbrev_desc in _abbrev_exc:
abbrev = abbrev_desc[ORTH] abbrev = abbrev_desc[ORTH]
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()): for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
_exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}] _exc[orth] = [{ORTH: orth, NORM: abbrev_desc[NORM]}]
_exc[orth + "."] = [ _exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
{ORTH: orth + ".", LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}
]
# common abbreviations # common abbreviations
_slang_exc = [ _slang_exc = [
# without dot # without dot
{ORTH: "др", LEMMA: "доктор", NORM: "доктор"}, {ORTH: "др", NORM: "доктор"},
{ORTH: "гдин", LEMMA: "господин", NORM: "господин"}, {ORTH: "гдин", NORM: "господин"},
{ORTH: "гђа", LEMMA: "госпођа", NORM: "госпођа"}, {ORTH: "гђа", NORM: "госпођа"},
{ORTH: "гђица", LEMMA: "госпођица", NORM: "госпођица"}, {ORTH: "гђица", NORM: "госпођица"},
{ORTH: "мр", LEMMA: "магистар", NORM: "магистар"}, {ORTH: "мр", NORM: "магистар"},
{ORTH: "Бгд", LEMMA: "Београд", NORM: "београд"}, {ORTH: "Бгд", NORM: "београд"},
{ORTH: "цм", LEMMA: "центиметар", NORM: "центиметар"}, {ORTH: "цм", NORM: "центиметар"},
{ORTH: "м", LEMMA: "метар", NORM: "метар"}, {ORTH: "м", NORM: "метар"},
{ORTH: "км", LEMMA: "километар", NORM: "километар"}, {ORTH: "км", NORM: "километар"},
{ORTH: "мг", LEMMA: "милиграм", NORM: "милиграм"}, {ORTH: "мг", NORM: "милиграм"},
{ORTH: "кг", LEMMA: "килограм", NORM: "килограм"}, {ORTH: "кг", NORM: "килограм"},
{ORTH: "дл", LEMMA: "децилитар", NORM: "децилитар"}, {ORTH: "дл", NORM: "децилитар"},
{ORTH: "хл", LEMMA: "хектолитар", NORM: "хектолитар"}, {ORTH: "хл", NORM: "хектолитар"},
# with dot # with dot
{ORTH: "ул.", LEMMA: "улица", NORM: "улица"}, {ORTH: "ул.", NORM: "улица"},
{ORTH: "бр.", LEMMA: "број", NORM: "број"}, {ORTH: "бр.", NORM: "број"},
{ORTH: "нпр.", LEMMA: "на пример", NORM: "на пример"}, {ORTH: "нпр.", NORM: "на пример"},
{ORTH: "тзв.", LEMMA: "такозван", NORM: "такозван"}, {ORTH: "тзв.", NORM: "такозван"},
{ORTH: "проф.", LEMMA: "професор", NORM: "професор"}, {ORTH: "проф.", NORM: "професор"},
{ORTH: "стр.", LEMMA: "страна", NORM: "страна"}, {ORTH: "стр.", NORM: "страна"},
{ORTH: "једн.", LEMMA: "једнина", NORM: "једнина"}, {ORTH: "једн.", NORM: "једнина"},
{ORTH: "мн.", LEMMA: "множина", NORM: "множина"}, {ORTH: "мн.", NORM: "множина"},
{ORTH: "уч.", LEMMA: "ученик", NORM: "ученик"}, {ORTH: "уч.", NORM: "ученик"},
{ORTH: "разр.", LEMMA: "разред", NORM: "разред"}, {ORTH: "разр.", NORM: "разред"},
{ORTH: "инж.", LEMMA: "инжењер", NORM: "инжењер"}, {ORTH: "инж.", NORM: "инжењер"},
{ORTH: "гимн.", LEMMA: "гимназија", NORM: "гимназија"}, {ORTH: "гимн.", NORM: "гимназија"},
{ORTH: "год.", LEMMA: "година", NORM: "година"}, {ORTH: "год.", NORM: "година"},
{ORTH: "мед.", LEMMA: "медицина", NORM: "медицина"}, {ORTH: "мед.", NORM: "медицина"},
{ORTH: "гимн.", LEMMA: "гимназија", NORM: "гимназија"}, {ORTH: "гимн.", NORM: "гимназија"},
{ORTH: "акад.", LEMMA: "академик", NORM: "академик"}, {ORTH: "акад.", NORM: "академик"},
{ORTH: "доц.", LEMMA: "доцент", NORM: "доцент"}, {ORTH: "доц.", NORM: "доцент"},
{ORTH: "итд.", LEMMA: "и тако даље", NORM: "и тако даље"}, {ORTH: "итд.", NORM: "и тако даље"},
{ORTH: "и сл.", LEMMA: "и слично", NORM: "и слично"}, {ORTH: "и сл.", NORM: "и слично"},
{ORTH: "н.е.", LEMMA: "нова ера", NORM: "нове ере"}, {ORTH: "н.е.", NORM: "нове ере"},
{ORTH: "о.г.", LEMMA: "ова година", NORM: "ове године"}, {ORTH: "о.г.", NORM: "ове године"},
{ORTH: "л.к.", LEMMA: "лична карта", NORM: "лична карта"}, {ORTH: "л.к.", NORM: "лична карта"},
{ORTH: "в.д.", LEMMA: "вршилац дужности", NORM: "вршилац дужности"}, {ORTH: "в.д.", NORM: "вршилац дужности"},
{ORTH: "стр.", LEMMA: "страна", NORM: "страна"}, {ORTH: "стр.", NORM: "страна"},
    # with quote # with quote
{ORTH: "ал'", LEMMA: "али", NORM: "али"}, {ORTH: "ал'", NORM: "али"},
{ORTH: "ил'", LEMMA: "или", NORM: "или"}, {ORTH: "ил'", NORM: "или"},
{ORTH: "је л'", LEMMA: "је ли", NORM: "је ли"}, {ORTH: "је л'", NORM: "је ли"},
{ORTH: "да л'", LEMMA: "да ли", NORM: "да ли"}, {ORTH: "да л'", NORM: "да ли"},
{ORTH: "држ'те", LEMMA: "држати", NORM: "држите"}, {ORTH: "држ'те", NORM: "држите"},
] ]
for slang_desc in _slang_exc: for slang_desc in _slang_exc:
_exc[slang_desc[ORTH]] = [slang_desc] _exc[slang_desc[ORTH]] = [slang_desc]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@ -1,54 +1,25 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
# Punctuation stolen from Danish # Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.sv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sv.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SwedishDefaults(Language.Defaults): class SwedishDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class Swedish(Language): class Swedish(Language):
lang = "sv" lang = "sv"
Defaults = SwedishDefaults Defaults = SwedishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Swedish"] __all__ = ["Swedish"]

View File

@ -1,27 +1,18 @@
from typing import Union, Iterator
from ...symbols import NOUN, PROPN, PRON from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...tokens import Doc, Span
def noun_chunks(doclike): def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
""" """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
Detect base noun phrases from a dependency parse. Works on both Doc and Span. # fmt: off
""" labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
labels = [ # fmt: on
"nsubj",
"nsubj:pass",
"dobj",
"obj",
"iobj",
"ROOT",
"appos",
"nmod",
"nmod:poss",
]
doc = doclike.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
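A usage sketch for the typed noun_chunks iterator above (spaCy ships no Swedish parser, so on a blank pipeline the is_parsed guard raises E029; this only demonstrates the error path):

import spacy

nlp = spacy.blank("sv")
doc = nlp("Det här är en mening.")
try:
    chunks = list(doc.noun_chunks)
except ValueError as err:
    print(err)  # [E029] noun_chunks requires the dependency parse ...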

View File

@ -1,4 +1,6 @@
from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import NORM, ORTH
from ...util import update_exc
_exc = {} _exc = {}
@ -8,61 +10,58 @@ _exc = {}
for verb_data in [ for verb_data in [
{ORTH: "driver"}, {ORTH: "driver"},
{ORTH: "kör"}, {ORTH: "kör"},
{ORTH: "hörr", LEMMA: "hör"}, {ORTH: "hörr"},
{ORTH: "fattar"}, {ORTH: "fattar"},
{ORTH: "hajar", LEMMA: "förstår"}, {ORTH: "hajar"},
{ORTH: "lever"}, {ORTH: "lever"},
{ORTH: "serr", LEMMA: "ser"}, {ORTH: "serr"},
{ORTH: "fixar"}, {ORTH: "fixar"},
]: ]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "u"] = [ _exc[data[ORTH] + "u"] = [data, {ORTH: "u", NORM: "du"}]
dict(data),
{ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"},
]
# Abbreviations for weekdays "sön." (for "söndag" / "söner") # Abbreviations for weekdays "sön." (for "söndag" / "söner")
# are left out because they are ambiguous. The same is the case # are left out because they are ambiguous. The same is the case
# for abbreviations "jul." and "Jul." ("juli" / "jul"). # for abbreviations "jul." and "Jul." ("juli" / "jul").
for exc_data in [ for exc_data in [
{ORTH: "jan.", LEMMA: "januari"}, {ORTH: "jan.", NORM: "januari"},
{ORTH: "febr.", LEMMA: "februari"}, {ORTH: "febr.", NORM: "februari"},
{ORTH: "feb.", LEMMA: "februari"}, {ORTH: "feb.", NORM: "februari"},
{ORTH: "apr.", LEMMA: "april"}, {ORTH: "apr.", NORM: "april"},
{ORTH: "jun.", LEMMA: "juni"}, {ORTH: "jun.", NORM: "juni"},
{ORTH: "aug.", LEMMA: "augusti"}, {ORTH: "aug.", NORM: "augusti"},
{ORTH: "sept.", LEMMA: "september"}, {ORTH: "sept.", NORM: "september"},
{ORTH: "sep.", LEMMA: "september"}, {ORTH: "sep.", NORM: "september"},
{ORTH: "okt.", LEMMA: "oktober"}, {ORTH: "okt.", NORM: "oktober"},
{ORTH: "nov.", LEMMA: "november"}, {ORTH: "nov.", NORM: "november"},
{ORTH: "dec.", LEMMA: "december"}, {ORTH: "dec.", NORM: "december"},
{ORTH: "mån.", LEMMA: "måndag"}, {ORTH: "mån.", NORM: "måndag"},
{ORTH: "tis.", LEMMA: "tisdag"}, {ORTH: "tis.", NORM: "tisdag"},
{ORTH: "ons.", LEMMA: "onsdag"}, {ORTH: "ons.", NORM: "onsdag"},
{ORTH: "tors.", LEMMA: "torsdag"}, {ORTH: "tors.", NORM: "torsdag"},
{ORTH: "fre.", LEMMA: "fredag"}, {ORTH: "fre.", NORM: "fredag"},
{ORTH: "lör.", LEMMA: "lördag"}, {ORTH: "lör.", NORM: "lördag"},
{ORTH: "Jan.", LEMMA: "Januari"}, {ORTH: "Jan.", NORM: "Januari"},
{ORTH: "Febr.", LEMMA: "Februari"}, {ORTH: "Febr.", NORM: "Februari"},
{ORTH: "Feb.", LEMMA: "Februari"}, {ORTH: "Feb.", NORM: "Februari"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", NORM: "April"},
{ORTH: "Jun.", LEMMA: "Juni"}, {ORTH: "Jun.", NORM: "Juni"},
{ORTH: "Aug.", LEMMA: "Augusti"}, {ORTH: "Aug.", NORM: "Augusti"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", NORM: "September"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", NORM: "September"},
{ORTH: "Okt.", LEMMA: "Oktober"}, {ORTH: "Okt.", NORM: "Oktober"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", NORM: "November"},
{ORTH: "Dec.", LEMMA: "December"}, {ORTH: "Dec.", NORM: "December"},
{ORTH: "Mån.", LEMMA: "Måndag"}, {ORTH: "Mån.", NORM: "Måndag"},
{ORTH: "Tis.", LEMMA: "Tisdag"}, {ORTH: "Tis.", NORM: "Tisdag"},
{ORTH: "Ons.", LEMMA: "Onsdag"}, {ORTH: "Ons.", NORM: "Onsdag"},
{ORTH: "Tors.", LEMMA: "Torsdag"}, {ORTH: "Tors.", NORM: "Torsdag"},
{ORTH: "Fre.", LEMMA: "Fredag"}, {ORTH: "Fre.", NORM: "Fredag"},
{ORTH: "Lör.", LEMMA: "Lördag"}, {ORTH: "Lör.", NORM: "Lördag"},
{ORTH: "sthlm", LEMMA: "Stockholm"}, {ORTH: "sthlm", NORM: "Stockholm"},
{ORTH: "gbg", LEMMA: "Göteborg"}, {ORTH: "gbg", NORM: "Göteborg"},
]: ]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
@ -152,6 +151,6 @@ for orth in ABBREVIATIONS:
# Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), # Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."),
# should be tokenized as two separate tokens. # should be tokenized as two separate tokens.
for orth in ["i", "m"]: for orth in ["i", "m"]:
_exc[orth + "."] = [{ORTH: orth, LEMMA: orth, NORM: orth}, {ORTH: "."}] _exc[orth + "."] = [{ORTH: orth, NORM: orth}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
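A sketch of the colloquial verb + "u" contraction pattern above in action (assumes a blank Swedish pipeline built from these exceptions):

import spacy

nlp = spacy.blank("sv")
doc = nlp("fattaru")
print([t.text for t in doc])  # ['fattar', 'u']
print(doc[1].norm_)           # du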

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class TamilDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "ta" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.ta.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
"""
@registry.language_data("spacy.ta.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ta.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Tamil(Language): class Tamil(Language):
lang = "ta" lang = "ta"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = TamilDefaults
__all__ = ["Tamil"] __all__ = ["Tamil"]

View File

@ -1,25 +0,0 @@
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART},
"_SP": {POS: SPACE},
}

View File

@ -1,33 +1,16 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language from ...language import Language
from ...util import registry
DEFAULT_CONFIG = """ class TeluguDefaults(Language.Defaults):
[nlp] lex_attr_getters = LEX_ATTRS
lang = "te" stop_words = STOP_WORDS
stop_words = {"@language_data": "spacy.te.stop_words"}
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
"""
@registry.language_data("spacy.te.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.te.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Telugu(Language): class Telugu(Language):
lang = "te" lang = "te"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = TeluguDefaults
__all__ = ["Telugu"] __all__ = ["Telugu"]

View File

@ -1,4 +1,3 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config from thinc.api import Config
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -10,26 +9,13 @@ from ...util import DummyTokenizer, registry
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
[nlp] [nlp]
lang = "th"
stop_words = {"@language_data": "spacy.th.stop_words"}
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.ThaiTokenizer.v1" @tokenizers = "spacy.th.ThaiTokenizer"
""" """
@registry.language_data("spacy.th.stop_words") @registry.tokenizers("spacy.th.ThaiTokenizer")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.th.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.ThaiTokenizer.v1")
def create_thai_tokenizer(): def create_thai_tokenizer():
def thai_tokenizer_factory(nlp): def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp) return ThaiTokenizer(nlp)
@ -55,9 +41,15 @@ class ThaiTokenizer(DummyTokenizer):
return Doc(self.vocab, words=words, spaces=spaces) return Doc(self.vocab, words=words, spaces=spaces)
class ThaiDefaults(Language.Defaults):
config = Config().from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Thai(Language): class Thai(Language):
lang = "th" lang = "th"
default_config = Config().from_str(DEFAULT_CONFIG) Defaults = ThaiDefaults
__all__ = ["Thai"] __all__ = ["Thai"]

View File

@ -1,469 +1,438 @@
from ...symbols import ORTH, LEMMA from ...symbols import ORTH
_exc = { _exc = {
# หน่วยงานรัฐ / government agency # หน่วยงานรัฐ / government agency
"กกต.": [{ORTH: "กกต.", LEMMA: "คณะกรรมการการเลือกตั้ง"}], "กกต.": [{ORTH: "กกต."}],
"กทท.": [{ORTH: "กทท.", LEMMA: "การท่าเรือแห่งประเทศไทย"}], "กทท.": [{ORTH: "กทท."}],
"กทพ.": [{ORTH: "กทพ.", LEMMA: "การทางพิเศษแห่งประเทศไทย"}], "กทพ.": [{ORTH: "กทพ."}],
"กบข.": [{ORTH: "กบข.", LEMMA: "กองทุนบำเหน็จบำนาญข้าราชการพลเรือน"}], "กบข.": [{ORTH: "กบข."}],
"กบว.": [{ORTH: "กบว.", LEMMA: "คณะกรรมการบริหารวิทยุกระจายเสียงและวิทยุโทรทัศน์"}], "กบว.": [{ORTH: "กบว."}],
"กปน.": [{ORTH: "กปน.", LEMMA: "การประปานครหลวง"}], "กปน.": [{ORTH: "กปน."}],
"กปภ.": [{ORTH: "กปภ.", LEMMA: "การประปาส่วนภูมิภาค"}], "กปภ.": [{ORTH: "กปภ."}],
"กปส.": [{ORTH: "กปส.", LEMMA: "กรมประชาสัมพันธ์"}], "กปส.": [{ORTH: "กปส."}],
"กผม.": [{ORTH: "กผม.", LEMMA: "กองผังเมือง"}], "กผม.": [{ORTH: "กผม."}],
"กฟน.": [{ORTH: "กฟน.", LEMMA: "การไฟฟ้านครหลวง"}], "กฟน.": [{ORTH: "กฟน."}],
"กฟผ.": [{ORTH: "กฟผ.", LEMMA: "การไฟฟ้าฝ่ายผลิตแห่งประเทศไทย"}], "กฟผ.": [{ORTH: "กฟผ."}],
"กฟภ.": [{ORTH: "กฟภ.", LEMMA: "การไฟฟ้าส่วนภูมิภาค"}], "กฟภ.": [{ORTH: "กฟภ."}],
"ก.ช.น.": [{ORTH: "ก.ช.น.", LEMMA: "คณะกรรมการช่วยเหลือชาวนาชาวไร่"}], "ก.ช.น.": [{ORTH: "ก.ช.น."}],
"กยศ.": [{ORTH: "กยศ.", LEMMA: "กองทุนเงินให้กู้ยืมเพื่อการศึกษา"}], "กยศ.": [{ORTH: "กยศ."}],
"ก.ล.ต.": [{ORTH: "ก.ล.ต.", LEMMA: "คณะกรรมการกำกับหลักทรัพย์และตลาดหลักทรัพย์"}], "ก.ล.ต.": [{ORTH: "ก.ล.ต."}],
"กศ.บ.": [{ORTH: "กศ.บ.", LEMMA: "การศึกษาบัณฑิต"}], "กศ.บ.": [{ORTH: "กศ.บ."}],
"กศน.": [{ORTH: "กศน.", LEMMA: "กรมการศึกษานอกโรงเรียน"}], "กศน.": [{ORTH: "กศน."}],
"กสท.": [{ORTH: "กสท.", LEMMA: "การสื่อสารแห่งประเทศไทย"}], "กสท.": [{ORTH: "กสท."}],
"กอ.รมน.": [{ORTH: "กอ.รมน.", LEMMA: "กองอำนวยการรักษาความมั่นคงภายใน"}], "กอ.รมน.": [{ORTH: "กอ.รมน."}],
"กร.": [{ORTH: "กร.", LEMMA: "กองเรือยุทธการ"}], "กร.": [{ORTH: "กร."}],
"ขสมก.": [{ORTH: "ขสมก.", LEMMA: "องค์การขนส่งมวลชนกรุงเทพ"}], "ขสมก.": [{ORTH: "ขสมก."}],
"คตง.": [{ORTH: "คตง.", LEMMA: "คณะกรรมการตรวจเงินแผ่นดิน"}], "คตง.": [{ORTH: "คตง."}],
"ครม.": [{ORTH: "ครม.", LEMMA: "คณะรัฐมนตรี"}], "ครม.": [{ORTH: "ครม."}],
"คมช.": [{ORTH: "คมช.", LEMMA: "คณะมนตรีความมั่นคงแห่งชาติ"}], "คมช.": [{ORTH: "คมช."}],
"ตชด.": [{ORTH: "ตชด.", LEMMA: "ตำรวจตะเวนชายเดน"}], "ตชด.": [{ORTH: "ตชด."}],
"ตม.": [{ORTH: "ตม.", LEMMA: "กองตรวจคนเข้าเมือง"}], "ตม.": [{ORTH: "ตม."}],
"ตร.": [{ORTH: "ตร.", LEMMA: "ตำรวจ"}], "ตร.": [{ORTH: "ตร."}],
"ททท.": [{ORTH: "ททท.", LEMMA: "การท่องเที่ยวแห่งประเทศไทย"}], "ททท.": [{ORTH: "ททท."}],
"ททบ.": [{ORTH: "ททบ.", LEMMA: "สถานีวิทยุโทรทัศน์กองทัพบก"}], "ททบ.": [{ORTH: "ททบ."}],
"ทบ.": [{ORTH: "ทบ.", LEMMA: "กองทัพบก"}], "ทบ.": [{ORTH: "ทบ."}],
"ทร.": [{ORTH: "ทร.", LEMMA: "กองทัพเรือ"}], "ทร.": [{ORTH: "ทร."}],
"ทอ.": [{ORTH: "ทอ.", LEMMA: "กองทัพอากาศ"}], "ทอ.": [{ORTH: "ทอ."}],
"ทอท.": [{ORTH: "ทอท.", LEMMA: "การท่าอากาศยานแห่งประเทศไทย"}], "ทอท.": [{ORTH: "ทอท."}],
"ธ.ก.ส.": [{ORTH: "ธ.ก.ส.", LEMMA: "ธนาคารเพื่อการเกษตรและสหกรณ์การเกษตร"}], "ธ.ก.ส.": [{ORTH: "ธ.ก.ส."}],
"ธปท.": [{ORTH: "ธปท.", LEMMA: "ธนาคารแห่งประเทศไทย"}], "ธปท.": [{ORTH: "ธปท."}],
"ธอส.": [{ORTH: "ธอส.", LEMMA: "ธนาคารอาคารสงเคราะห์"}], "ธอส.": [{ORTH: "ธอส."}],
"นย.": [{ORTH: "นย.", LEMMA: "นาวิกโยธิน"}], "นย.": [{ORTH: "นย."}],
"ปตท.": [{ORTH: "ปตท.", LEMMA: "การปิโตรเลียมแห่งประเทศไทย"}], "ปตท.": [{ORTH: "ปตท."}],
"ป.ป.ช.": [ "ป.ป.ช.": [{ORTH: "ป.ป.ช."}],
{ "ป.ป.ส.": [{ORTH: "ป.ป.ส."}],
ORTH: "ป.ป.ช.", "บพร.": [{ORTH: "บพร."}],
LEMMA: "คณะกรรมการป้องกันและปราบปรามการทุจริตและประพฤติมิชอบในวงราชการ", "บย.": [{ORTH: "บย."}],
} "พสวท.": [{ORTH: "พสวท."}],
], "มอก.": [{ORTH: "มอก."}],
"ป.ป.ส.": [{ORTH: "ป.ป.ส.", LEMMA: "คณะกรรมการป้องกันและปราบปรามยาเสพติด"}], "ยธ.": [{ORTH: "ยธ."}],
"บพร.": [{ORTH: "บพร.", LEMMA: "กรมการบินพลเรือน"}], "รพช.": [{ORTH: "รพช."}],
"บย.": [{ORTH: "บย.", LEMMA: "กองบินยุทธการ"}], "รฟท.": [{ORTH: "รฟท."}],
"พสวท.": [ "รฟม.": [{ORTH: "รฟม."}],
{ "ศธ.": [{ORTH: "ศธ."}],
ORTH: "พสวท.", "ศนธ.": [{ORTH: "ศนธ."}],
LEMMA: "โครงการพัฒนาและส่งเสริมผู้มีความรู้ความสามารถพิเศษทางวิทยาศาสตร์และเทคโนโลยี", "สกจ.": [{ORTH: "สกจ."}],
} "สกท.": [{ORTH: "สกท."}],
], "สกว.": [{ORTH: "สกว."}],
"มอก.": [{ORTH: "มอก.", LEMMA: "สำนักงานมาตรฐานผลิตภัณฑ์อุตสาหกรรม"}], "สคบ.": [{ORTH: "สคบ."}],
"ยธ.": [{ORTH: "ยธ.", LEMMA: "กรมโยธาธิการ"}], "สจร.": [{ORTH: "สจร."}],
"รพช.": [{ORTH: "รพช.", LEMMA: "สำนักงานเร่งรัดพัฒนาชนบท"}], "สตง.": [{ORTH: "สตง."}],
"รฟท.": [{ORTH: "รฟท.", LEMMA: "การรถไฟแห่งประเทศไทย"}], "สทท.": [{ORTH: "สทท."}],
"รฟม.": [{ORTH: "รฟม.", LEMMA: "การรถไฟฟ้าขนส่งมวลชนแห่งประเทศไทย"}], "สทร.": [{ORTH: "สทร."}],
"ศธ.": [{ORTH: "ศธ.", LEMMA: "กระทรวงศึกษาธิการ"}], "สธ": [{ORTH: "สธ"}],
"ศนธ.": [{ORTH: "ศนธ.", LEMMA: "ศูนย์กลางนิสิตนักศึกษาแห่งประเทศไทย"}], "สนช.": [{ORTH: "สนช."}],
"สกจ.": [{ORTH: "สกจ.", LEMMA: "สหกรณ์จังหวัด"}], "สนนท.": [{ORTH: "สนนท."}],
"สกท.": [{ORTH: "สกท.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมการลงทุน"}], "สปก.": [{ORTH: "สปก."}],
"สกว.": [{ORTH: "สกว.", LEMMA: "สำนักงานกองทุนสนับสนุนการวิจัย"}], "สปช.": [{ORTH: "สปช."}],
"สคบ.": [{ORTH: "สคบ.", LEMMA: "สำนักงานคณะกรรมการคุ้มครองผู้บริโภค"}], "สปอ.": [{ORTH: "สปอ."}],
"สจร.": [{ORTH: "สจร.", LEMMA: "สำนักงานคณะกรรมการจัดระบบการจราจรทางบก"}], "สพช.": [{ORTH: "สพช."}],
"สตง.": [{ORTH: "สตง.", LEMMA: "สำนักงานตรวจเงินแผ่นดิน"}], "สยช.": [{ORTH: "สยช."}],
"สทท.": [{ORTH: "สทท.", LEMMA: "สถานีวิทยุโทรทัศน์แห่งประเทศไทย"}], "สวช.": [{ORTH: "สวช."}],
"สทร.": [{ORTH: "สทร.", LEMMA: "สำนักงานกลางทะเบียนราษฎร์"}], "สวท.": [{ORTH: "สวท."}],
"สธ": [{ORTH: "สธ", LEMMA: "กระทรวงสาธารณสุข"}], "สวทช.": [{ORTH: "สวทช."}],
"สนช.": [{ORTH: "สนช.", LEMMA: "สภานิติบัญญัติแห่งชาติ,สำนักงานนวัตกรรมแห่งชาติ"}], "สคช.": [{ORTH: "สคช."}],
"สนนท.": [{ORTH: "สนนท.", LEMMA: "สหพันธ์นิสิตนักศึกษาแห่งประเทศไทย"}], "สสว.": [{ORTH: "สสว."}],
"สปก.": [{ORTH: "สปก.", LEMMA: "สำนักงานการปฏิรูปที่ดินเพื่อเกษตรกรรม"}], "สสส.": [{ORTH: "สสส."}],
"สปช.": [{ORTH: "สปช.", LEMMA: "สำนักงานคณะกรรมการการประถมศึกษาแห่งชาติ"}], "สสวท.": [{ORTH: "สสวท."}],
"สปอ.": [{ORTH: "สปอ.", LEMMA: "สำนักงานการประถมศึกษาอำเภอ"}], "อตก.": [{ORTH: "อตก."}],
"สพช.": [{ORTH: "สพช.", LEMMA: "สำนักงานคณะกรรมการนโยบายพลังงานแห่งชาติ"}], "อบจ.": [{ORTH: "อบจ."}],
"สยช.": [ "อบต.": [{ORTH: "อบต."}],
{ORTH: "สยช.", LEMMA: "สำนักงานคณะกรรมการส่งเสริมและประสานงานเยาวชนแห่งชาติ"} "อปพร.": [{ORTH: "อปพร."}],
], "อย.": [{ORTH: "อย."}],
"สวช.": [{ORTH: "สวช.", LEMMA: "สำนักงานคณะกรรมการวัฒนธรรมแห่งชาติ"}], "อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท."}],
"สวท.": [{ORTH: "สวท.", LEMMA: "สถานีวิทยุกระจายเสียงแห่งประเทศไทย"}],
"สวทช.": [{ORTH: "สวทช.", LEMMA: "สำนักงานพัฒนาวิทยาศาสตร์และเทคโนโลยีแห่งชาติ"}],
"สคช.": [
{ORTH: "สคช.", LEMMA: "สำนักงานคณะกรรมการพัฒนาการเศรษฐกิจและสังคมแห่งชาติ"}
],
"สสว.": [{ORTH: "สสว.", LEMMA: "สำนักงานส่งเสริมวิสาหกิจขนาดกลางและขนาดย่อม"}],
"สสส.": [{ORTH: "สสส.", LEMMA: "สำนักงานกองทุนสนับสนุนการสร้างเสริมสุขภาพ"}],
"สสวท.": [{ORTH: "สสวท.", LEMMA: "สถาบันส่งเสริมการสอนวิทยาศาสตร์และเทคโนโลยี"}],
"อตก.": [{ORTH: "อตก.", LEMMA: "องค์การตลาดเพื่อเกษตรกร"}],
"อบจ.": [{ORTH: "อบจ.", LEMMA: "องค์การบริหารส่วนจังหวัด"}],
"อบต.": [{ORTH: "อบต.", LEMMA: "องค์การบริหารส่วนตำบล"}],
"อปพร.": [{ORTH: "อปพร.", LEMMA: "อาสาสมัครป้องกันภัยฝ่ายพลเรือน"}],
"อย.": [{ORTH: "อย.", LEMMA: "สำนักงานคณะกรรมการอาหารและยา"}],
"อ.ส.ม.ท.": [{ORTH: "อ.ส.ม.ท.", LEMMA: "องค์การสื่อสารมวลชนแห่งประเทศไทย"}],
# มหาวิทยาลัย / สถานศึกษา / university / college # มหาวิทยาลัย / สถานศึกษา / university / college
"มทส.": [{ORTH: "มทส.", LEMMA: "มหาวิทยาลัยเทคโนโลยีสุรนารี"}], "มทส.": [{ORTH: "มทส."}],
"มธ.": [{ORTH: "มธ.", LEMMA: "มหาวิทยาลัยธรรมศาสตร์"}], "มธ.": [{ORTH: "มธ."}],
"ม.อ.": [{ORTH: "ม.อ.", LEMMA: "มหาวิทยาลัยสงขลานครินทร์"}], "ม.อ.": [{ORTH: "ม.อ."}],
"มทร.": [{ORTH: "มทร.", LEMMA: "มหาวิทยาลัยเทคโนโลยีราชมงคล"}], "มทร.": [{ORTH: "มทร."}],
"มมส.": [{ORTH: "มมส.", LEMMA: "มหาวิทยาลัยมหาสารคาม"}], "มมส.": [{ORTH: "มมส."}],
"วท.": [{ORTH: "วท.", LEMMA: "วิทยาลัยเทคนิค"}], "วท.": [{ORTH: "วท."}],
"สตม.": [{ORTH: "สตม.", LEMMA: "สำนักงานตรวจคนเข้าเมือง (ตำรวจ)"}], "สตม.": [{ORTH: "สตม."}],
# ยศ / rank # ยศ / rank
"ดร.": [{ORTH: "ดร.", LEMMA: "ดอกเตอร์"}], "ดร.": [{ORTH: "ดร."}],
"ด.ต.": [{ORTH: "ด.ต.", LEMMA: "ดาบตำรวจ"}], "ด.ต.": [{ORTH: "ด.ต."}],
"จ.ต.": [{ORTH: "จ.ต.", LEMMA: "จ่าตรี"}], "จ.ต.": [{ORTH: "จ.ต."}],
"จ.ท.": [{ORTH: "จ.ท.", LEMMA: "จ่าโท"}], "จ.ท.": [{ORTH: "จ.ท."}],
"จ.ส.ต.": [{ORTH: "จ.ส.ต.", LEMMA: "จ่าสิบตรี (ทหารบก)"}], "จ.ส.ต.": [{ORTH: "จ.ส.ต."}],
"จสต.": [{ORTH: "จสต.", LEMMA: "จ่าสิบตำรวจ"}], "จสต.": [{ORTH: "จสต."}],
"จ.ส.ท.": [{ORTH: "จ.ส.ท.", LEMMA: "จ่าสิบโท"}], "จ.ส.ท.": [{ORTH: "จ.ส.ท."}],
"จ.ส.อ.": [{ORTH: "จ.ส.อ.", LEMMA: "จ่าสิบเอก"}], "จ.ส.อ.": [{ORTH: "จ.ส.อ."}],
"จ.อ.": [{ORTH: "จ.อ.", LEMMA: "จ่าเอก"}], "จ.อ.": [{ORTH: "จ.อ."}],
"ทพญ.": [{ORTH: "ทพญ.", LEMMA: "ทันตแพทย์หญิง"}], "ทพญ.": [{ORTH: "ทพญ."}],
"ทนพ.": [{ORTH: "ทนพ.", LEMMA: "เทคนิคการแพทย์"}], "ทนพ.": [{ORTH: "ทนพ."}],
"นจอ.": [{ORTH: "นจอ.", LEMMA: "นักเรียนจ่าอากาศ"}], "นจอ.": [{ORTH: "นจอ."}],
"น.ช.": [{ORTH: "น.ช.", LEMMA: "นักโทษชาย"}], "น.ช.": [{ORTH: "น.ช."}],
"น.ญ.": [{ORTH: "น.ญ.", LEMMA: "นักโทษหญิง"}], "น.ญ.": [{ORTH: "น.ญ."}],
"น.ต.": [{ORTH: "น.ต.", LEMMA: "นาวาตรี"}], "น.ต.": [{ORTH: "น.ต."}],
"น.ท.": [{ORTH: "น.ท.", LEMMA: "นาวาโท"}], "น.ท.": [{ORTH: "น.ท."}],
"นตท.": [{ORTH: "นตท.", LEMMA: "นักเรียนเตรียมทหาร"}], "นตท.": [{ORTH: "นตท."}],
"นนส.": [{ORTH: "นนส.", LEMMA: "นักเรียนนายสิบทหารบก"}], "นนส.": [{ORTH: "นนส."}],
"นนร.": [{ORTH: "นนร.", LEMMA: "นักเรียนนายร้อย"}], "นนร.": [{ORTH: "นนร."}],
"นนอ.": [{ORTH: "นนอ.", LEMMA: "นักเรียนนายเรืออากาศ"}], "นนอ.": [{ORTH: "นนอ."}],
"นพ.": [{ORTH: "นพ.", LEMMA: "นายแพทย์"}], "นพ.": [{ORTH: "นพ."}],
"นพท.": [{ORTH: "นพท.", LEMMA: "นายแพทย์ทหาร"}], "นพท.": [{ORTH: "นพท."}],
"นรจ.": [{ORTH: "นรจ.", LEMMA: "นักเรียนจ่าทหารเรือ"}], "นรจ.": [{ORTH: "นรจ."}],
"นรต.": [{ORTH: "นรต.", LEMMA: "นักเรียนนายร้อยตำรวจ"}], "นรต.": [{ORTH: "นรต."}],
"นศพ.": [{ORTH: "นศพ.", LEMMA: "นักศึกษาแพทย์"}], "นศพ.": [{ORTH: "นศพ."}],
"นศท.": [{ORTH: "นศท.", LEMMA: "นักศึกษาวิชาทหาร"}], "นศท.": [{ORTH: "นศท."}],
"น.สพ.": [{ORTH: "น.สพ.", LEMMA: "นายสัตวแพทย์ (พ.ร.บ.วิชาชีพการสัตวแพทย์)"}], "น.สพ.": [{ORTH: "น.สพ."}],
"น.อ.": [{ORTH: "น.อ.", LEMMA: "นาวาเอก"}], "น.อ.": [{ORTH: "น.อ."}],
"บช.ก.": [{ORTH: "บช.ก.", LEMMA: "กองบัญชาการตำรวจสอบสวนกลาง"}], "บช.ก.": [{ORTH: "บช.ก."}],
"บช.น.": [{ORTH: "บช.น.", LEMMA: "กองบัญชาการตำรวจนครบาล"}], "บช.น.": [{ORTH: "บช.น."}],
"ผกก.": [{ORTH: "ผกก.", LEMMA: "ผู้กำกับการ"}], "ผกก.": [{ORTH: "ผกก."}],
"ผกก.ภ.": [{ORTH: "ผกก.ภ.", LEMMA: "ผู้กำกับการตำรวจภูธร"}], "ผกก.ภ.": [{ORTH: "ผกก.ภ."}],
"ผจก.": [{ORTH: "ผจก.", LEMMA: "ผู้จัดการ"}], "ผจก.": [{ORTH: "ผจก."}],
"ผช.": [{ORTH: "ผช.", LEMMA: "ผู้ช่วย"}], "ผช.": [{ORTH: "ผช."}],
"ผชก.": [{ORTH: "ผชก.", LEMMA: "ผู้ชำนาญการ"}], "ผชก.": [{ORTH: "ผชก."}],
"ผช.ผอ.": [{ORTH: "ผช.ผอ.", LEMMA: "ผู้ช่วยผู้อำนวยการ"}], "ผช.ผอ.": [{ORTH: "ผช.ผอ."}],
"ผญบ.": [{ORTH: "ผญบ.", LEMMA: "ผู้ใหญ่บ้าน"}], "ผญบ.": [{ORTH: "ผญบ."}],
"ผบ.": [{ORTH: "ผบ.", LEMMA: "ผู้บังคับบัญชา"}], "ผบ.": [{ORTH: "ผบ."}],
"ผบก.": [{ORTH: "ผบก.", LEMMA: "ผู้บังคับบัญชาการ (ตำรวจ)"}], "ผบก.": [{ORTH: "ผบก."}],
"ผบก.น.": [{ORTH: "ผบก.น.", LEMMA: "ผู้บังคับการตำรวจนครบาล"}], "ผบก.น.": [{ORTH: "ผบก.น."}],
"ผบก.ป.": [{ORTH: "ผบก.ป.", LEMMA: "ผู้บังคับการตำรวจกองปราบปราม"}], "ผบก.ป.": [{ORTH: "ผบก.ป."}],
"ผบก.ปค.": [ "ผบก.ปค.": [{ORTH: "ผบก.ปค."}],
{ "ผบก.ปม.": [{ORTH: "ผบก.ปม."}],
ORTH: "ผบก.ปค.", "ผบก.ภ.": [{ORTH: "ผบก.ภ."}],
LEMMA: "ผู้บังคับการ กองบังคับการปกครอง (โรงเรียนนายร้อยตำรวจ)", "ผบช.": [{ORTH: "ผบช."}],
} "ผบช.ก.": [{ORTH: "ผบช.ก."}],
], "ผบช.ตชด.": [{ORTH: "ผบช.ตชด."}],
"ผบก.ปม.": [{ORTH: "ผบก.ปม.", LEMMA: "ผู้บังคับการตำรวจป่าไม้"}], "ผบช.น.": [{ORTH: "ผบช.น."}],
"ผบก.ภ.": [{ORTH: "ผบก.ภ.", LEMMA: "ผู้บังคับการตำรวจภูธร"}], "ผบช.ภ.": [{ORTH: "ผบช.ภ."}],
"ผบช.": [{ORTH: "ผบช.", LEMMA: "ผู้บัญชาการ (ตำรวจ)"}], "ผบ.ทบ.": [{ORTH: "ผบ.ทบ."}],
"ผบช.ก.": [{ORTH: "ผบช.ก.", LEMMA: "ผู้บัญชาการตำรวจสอบสวนกลาง"}], "ผบ.ตร.": [{ORTH: "ผบ.ตร."}],
"ผบช.ตชด.": [{ORTH: "ผบช.ตชด.", LEMMA: "ผู้บัญชาการตำรวจตระเวนชายแดน"}], "ผบ.ทร.": [{ORTH: "ผบ.ทร."}],
"ผบช.น.": [{ORTH: "ผบช.น.", LEMMA: "ผู้บัญชาการตำรวจนครบาล"}], "ผบ.ทอ.": [{ORTH: "ผบ.ทอ."}],
"ผบช.ภ.": [{ORTH: "ผบช.ภ.", LEMMA: "ผู้บัญชาการตำรวจภูธร"}], "ผบ.ทสส.": [{ORTH: "ผบ.ทสส."}],
"ผบ.ทบ.": [{ORTH: "ผบ.ทบ.", LEMMA: "ผู้บัญชาการทหารบก"}], "ผวจ.": [{ORTH: "ผวจ."}],
"ผบ.ตร.": [{ORTH: "ผบ.ตร.", LEMMA: "ผู้บัญชาการตำรวจแห่งชาติ"}], "ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ"}],
"ผบ.ทร.": [{ORTH: "ผบ.ทร.", LEMMA: "ผู้บัญชาการทหารเรือ"}], "พ.จ.ต.": [{ORTH: "พ.จ.ต."}],
"ผบ.ทอ.": [{ORTH: "ผบ.ทอ.", LEMMA: "ผู้บัญชาการทหารอากาศ"}], "พ.จ.ท.": [{ORTH: "พ.จ.ท."}],
"ผบ.ทสส.": [{ORTH: "ผบ.ทสส.", LEMMA: "ผู้บัญชาการทหารสูงสุด"}], "พ.จ.อ.": [{ORTH: "พ.จ.อ."}],
"ผวจ.": [{ORTH: "ผวจ.", LEMMA: "ผู้ว่าราชการจังหวัด"}], "พญ.": [{ORTH: "พญ."}],
"ผู้ว่าฯ": [{ORTH: "ผู้ว่าฯ", LEMMA: "ผู้ว่าราชการจังหวัด"}], "ฯพณฯ": [{ORTH: "ฯพณฯ"}],
"พ.จ.ต.": [{ORTH: "พ.จ.ต.", LEMMA: "พันจ่าตรี"}], "พ.ต.": [{ORTH: "พ.ต."}],
"พ.จ.ท.": [{ORTH: "พ.จ.ท.", LEMMA: "พันจ่าโท"}], "พ.ท.": [{ORTH: "พ.ท."}],
"พ.จ.อ.": [{ORTH: "พ.จ.อ.", LEMMA: "พันจ่าเอก"}], "พ.อ.": [{ORTH: "พ.อ."}],
"พญ.": [{ORTH: "พญ.", LEMMA: "แพทย์หญิง"}], "พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ"}],
"ฯพณฯ": [{ORTH: "ฯพณฯ", LEMMA: "พณท่าน"}], "พลฯ": [{ORTH: "พลฯ"}],
"พ.ต.": [{ORTH: "พ.ต.", LEMMA: "พันตรี"}], "พล.๑ รอ.": [{ORTH: "พล.๑ รอ."}],
"พ.ท.": [{ORTH: "พ.ท.", LEMMA: "พันโท"}], "พล.ต.": [{ORTH: "พล.ต."}],
"พ.อ.": [{ORTH: "พ.อ.", LEMMA: "พันเอก"}], "พล.ต.ต.": [{ORTH: "พล.ต.ต."}],
"พ.ต.อ.พิเศษ": [{ORTH: "พ.ต.อ.พิเศษ", LEMMA: "พันตำรวจเอกพิเศษ"}], "พล.ต.ท.": [{ORTH: "พล.ต.ท."}],
"พลฯ": [{ORTH: "พลฯ", LEMMA: "พลทหาร"}], "พล.ต.อ.": [{ORTH: "พล.ต.อ."}],
"พล.๑ รอ.": [{ORTH: "พล.๑ รอ.", LEMMA: "กองพลที่ ๑ รักษาพระองค์ กองทัพบก"}], "พล.ท.": [{ORTH: "พล.ท."}],
"พล.ต.": [{ORTH: "พล.ต.", LEMMA: "พลตรี"}], "พล.ปตอ.": [{ORTH: "พล.ปตอ."}],
"พล.ต.ต.": [{ORTH: "พล.ต.ต.", LEMMA: "พลตำรวจตรี"}], "พล.ม.": [{ORTH: "พล.ม."}],
"พล.ต.ท.": [{ORTH: "พล.ต.ท.", LEMMA: "พลตำรวจโท"}], "พล.ม.๒": [{ORTH: "พล.ม.๒"}],
"พล.ต.อ.": [{ORTH: "พล.ต.อ.", LEMMA: "พลตำรวจเอก"}], "พล.ร.ต.": [{ORTH: "พล.ร.ต."}],
"พล.ท.": [{ORTH: "พล.ท.", LEMMA: "พลโท"}], "พล.ร.ท.": [{ORTH: "พล.ร.ท."}],
"พล.ปตอ.": [{ORTH: "พล.ปตอ.", LEMMA: "กองพลทหารปืนใหญ่ต่อสู่อากาศยาน"}], "พล.ร.อ.": [{ORTH: "พล.ร.อ."}],
"พล.ม.": [{ORTH: "พล.ม.", LEMMA: "กองพลทหารม้า"}], "พล.อ.": [{ORTH: "พล.อ."}],
"พล.ม.๒": [{ORTH: "พล.ม.๒", LEMMA: "กองพลทหารม้าที่ ๒"}], "พล.อ.ต.": [{ORTH: "พล.อ.ต."}],
"พล.ร.ต.": [{ORTH: "พล.ร.ต.", LEMMA: "พลเรือตรี"}], "พล.อ.ท.": [{ORTH: "พล.อ.ท."}],
"พล.ร.ท.": [{ORTH: "พล.ร.ท.", LEMMA: "พลเรือโท"}], "พล.อ.อ.": [{ORTH: "พล.อ.อ."}],
"พล.ร.อ.": [{ORTH: "พล.ร.อ.", LEMMA: "พลเรือเอก"}], "พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ"}],
"พล.อ.": [{ORTH: "พล.อ.", LEMMA: "พลเอก"}], "พ.อ.ต.": [{ORTH: "พ.อ.ต."}],
"พล.อ.ต.": [{ORTH: "พล.อ.ต.", LEMMA: "พลอากาศตรี"}], "พ.อ.ท.": [{ORTH: "พ.อ.ท."}],
"พล.อ.ท.": [{ORTH: "พล.อ.ท.", LEMMA: "พลอากาศโท"}], "พ.อ.อ.": [{ORTH: "พ.อ.อ."}],
"พล.อ.อ.": [{ORTH: "พล.อ.อ.", LEMMA: "พลอากาศเอก"}], "ภกญ.": [{ORTH: "ภกญ."}],
"พ.อ.พิเศษ": [{ORTH: "พ.อ.พิเศษ", LEMMA: "พันเอกพิเศษ"}], "ม.จ.": [{ORTH: "ม.จ."}],
"พ.อ.ต.": [{ORTH: "พ.อ.ต.", LEMMA: "พันจ่าอากาศตรี"}], "มท1": [{ORTH: "มท1"}],
"พ.อ.ท.": [{ORTH: "พ.อ.ท.", LEMMA: "พันจ่าอากาศโท"}], "ม.ร.ว.": [{ORTH: "ม.ร.ว."}],
"พ.อ.อ.": [{ORTH: "พ.อ.อ.", LEMMA: "พันจ่าอากาศเอก"}], "มล.": [{ORTH: "มล."}],
"ภกญ.": [{ORTH: "ภกญ.", LEMMA: "เภสัชกรหญิง"}], "ร.ต.": [{ORTH: "ร.ต."}],
"ม.จ.": [{ORTH: "ม.จ.", LEMMA: "หม่อมเจ้า"}], "ร.ต.ต.": [{ORTH: "ร.ต.ต."}],
"มท1": [{ORTH: "มท1", LEMMA: "รัฐมนตรีว่าการกระทรวงมหาดไทย"}], "ร.ต.ท.": [{ORTH: "ร.ต.ท."}],
"ม.ร.ว.": [{ORTH: "ม.ร.ว.", LEMMA: "หม่อมราชวงศ์"}], "ร.ต.อ.": [{ORTH: "ร.ต.อ."}],
"มล.": [{ORTH: "มล.", LEMMA: "หม่อมหลวง"}], "ร.ท.": [{ORTH: "ร.ท."}],
"ร.ต.": [{ORTH: "ร.ต.", LEMMA: "ร้อยตรี,เรือตรี,เรืออากาศตรี"}], "รมช.": [{ORTH: "รมช."}],
"ร.ต.ต.": [{ORTH: "ร.ต.ต.", LEMMA: "ร้อยตำรวจตรี"}], "รมต.": [{ORTH: "รมต."}],
"ร.ต.ท.": [{ORTH: "ร.ต.ท.", LEMMA: "ร้อยตำรวจโท"}], "รมว.": [{ORTH: "รมว."}],
"ร.ต.อ.": [{ORTH: "ร.ต.อ.", LEMMA: "ร้อยตำรวจเอก"}], "รศ.": [{ORTH: "รศ."}],
"ร.ท.": [{ORTH: "ร.ท.", LEMMA: "ร้อยโท,เรือโท,เรืออากาศโท"}], "ร.อ.": [{ORTH: "ร.อ."}],
"รมช.": [{ORTH: "รมช.", LEMMA: "รัฐมนตรีช่วยว่าการกระทรวง"}], "ศ.": [{ORTH: "ศ."}],
"รมต.": [{ORTH: "รมต.", LEMMA: "รัฐมนตรี"}], "ส.ต.": [{ORTH: "ส.ต."}],
"รมว.": [{ORTH: "รมว.", LEMMA: "รัฐมนตรีว่าการกระทรวง"}], "ส.ต.ต.": [{ORTH: "ส.ต.ต."}],
"รศ.": [{ORTH: "รศ.", LEMMA: "รองศาสตราจารย์"}], "ส.ต.ท.": [{ORTH: "ส.ต.ท."}],
"ร.อ.": [{ORTH: "ร.อ.", LEMMA: "ร้อยเอก,เรือเอก,เรืออากาศเอก"}], "ส.ต.อ.": [{ORTH: "ส.ต.อ."}],
"ศ.": [{ORTH: "ศ.", LEMMA: "ศาสตราจารย์"}], "ส.ท.": [{ORTH: "ส.ท."}],
"ส.ต.": [{ORTH: "ส.ต.", LEMMA: "สิบตรี"}], "สพ.": [{ORTH: "สพ."}],
"ส.ต.ต.": [{ORTH: "ส.ต.ต.", LEMMA: "สิบตำรวจตรี"}], "สพ.ญ.": [{ORTH: "สพ.ญ."}],
"ส.ต.ท.": [{ORTH: "ส.ต.ท.", LEMMA: "สิบตำรวจโท"}], "สพ.ช.": [{ORTH: "สพ.ช."}],
"ส.ต.อ.": [{ORTH: "ส.ต.อ.", LEMMA: "สิบตำรวจเอก"}], "ส.อ.": [{ORTH: "ส.อ."}],
"ส.ท.": [{ORTH: "ส.ท.", LEMMA: "สิบโท"}], "อจ.": [{ORTH: "อจ."}],
"สพ.": [{ORTH: "สพ.", LEMMA: "สัตวแพทย์"}], "อจญ.": [{ORTH: "อจญ."}],
"สพ.ญ.": [{ORTH: "สพ.ญ.", LEMMA: "สัตวแพทย์หญิง"}],
"สพ.ช.": [{ORTH: "สพ.ช.", LEMMA: "สัตวแพทย์ชาย"}],
"ส.อ.": [{ORTH: "ส.อ.", LEMMA: "สิบเอก"}],
"อจ.": [{ORTH: "อจ.", LEMMA: "อาจารย์"}],
"อจญ.": [{ORTH: "อจญ.", LEMMA: "อาจารย์ใหญ่"}],
# วุฒิ / bachelor degree # วุฒิ / bachelor degree
"ป.": [{ORTH: "ป.", LEMMA: "ประถมศึกษา"}], "ป.": [{ORTH: "ป."}],
"ป.กศ.": [{ORTH: "ป.กศ.", LEMMA: "ประกาศนียบัตรวิชาการศึกษา"}], "ป.กศ.": [{ORTH: "ป.กศ."}],
"ป.กศ.สูง": [{ORTH: "ป.กศ.สูง", LEMMA: "ประกาศนียบัตรวิชาการศึกษาชั้นสูง"}], "ป.กศ.สูง": [{ORTH: "ป.กศ.สูง"}],
"ปวช.": [{ORTH: "ปวช.", LEMMA: "ประกาศนียบัตรวิชาชีพ"}], "ปวช.": [{ORTH: "ปวช."}],
"ปวท.": [{ORTH: "ปวท.", LEMMA: "ประกาศนียบัตรวิชาชีพเทคนิค"}], "ปวท.": [{ORTH: "ปวท."}],
"ปวส.": [{ORTH: "ปวส.", LEMMA: "ประกาศนียบัตรวิชาชีพชั้นสูง"}], "ปวส.": [{ORTH: "ปวส."}],
"ปทส.": [{ORTH: "ปทส.", LEMMA: "ประกาศนียบัตรครูเทคนิคชั้นสูง"}], "ปทส.": [{ORTH: "ปทส."}],
"กษ.บ.": [{ORTH: "กษ.บ.", LEMMA: "เกษตรศาสตรบัณฑิต"}], "กษ.บ.": [{ORTH: "กษ.บ."}],
"กษ.ม.": [{ORTH: "กษ.ม.", LEMMA: "เกษตรศาสตรมหาบัณฑิต"}], "กษ.ม.": [{ORTH: "กษ.ม."}],
"กษ.ด.": [{ORTH: "กษ.ด.", LEMMA: "เกษตรศาสตรดุษฎีบัณฑิต"}], "กษ.ด.": [{ORTH: "กษ.ด."}],
"ค.บ.": [{ORTH: "ค.บ.", LEMMA: "ครุศาสตรบัณฑิต"}], "ค.บ.": [{ORTH: "ค.บ."}],
"คศ.บ.": [{ORTH: "คศ.บ.", LEMMA: "คหกรรมศาสตรบัณฑิต"}], "คศ.บ.": [{ORTH: "คศ.บ."}],
"คศ.ม.": [{ORTH: "คศ.ม.", LEMMA: "คหกรรมศาสตรมหาบัณฑิต"}], "คศ.ม.": [{ORTH: "คศ.ม."}],
"คศ.ด.": [{ORTH: "คศ.ด.", LEMMA: "คหกรรมศาสตรดุษฎีบัณฑิต"}], "คศ.ด.": [{ORTH: "คศ.ด."}],
"ค.อ.บ.": [{ORTH: "ค.อ.บ.", LEMMA: "ครุศาสตรอุตสาหกรรมบัณฑิต"}], "ค.อ.บ.": [{ORTH: "ค.อ.บ."}],
"ค.อ.ม.": [{ORTH: "ค.อ.ม.", LEMMA: "ครุศาสตรอุตสาหกรรมมหาบัณฑิต"}], "ค.อ.ม.": [{ORTH: "ค.อ.ม."}],
"ค.อ.ด.": [{ORTH: "ค.อ.ด.", LEMMA: "ครุศาสตรอุตสาหกรรมดุษฎีบัณฑิต"}], "ค.อ.ด.": [{ORTH: "ค.อ.ด."}],
"ทก.บ.": [{ORTH: "ทก.บ.", LEMMA: "เทคโนโลยีการเกษตรบัณฑิต"}], "ทก.บ.": [{ORTH: "ทก.บ."}],
"ทก.ม.": [{ORTH: "ทก.ม.", LEMMA: "เทคโนโลยีการเกษตรมหาบัณฑิต"}], "ทก.ม.": [{ORTH: "ทก.ม."}],
"ทก.ด.": [{ORTH: "ทก.ด.", LEMMA: "เทคโนโลยีการเกษตรดุษฎีบัณฑิต"}], "ทก.ด.": [{ORTH: "ทก.ด."}],
"ท.บ.": [{ORTH: "ท.บ.", LEMMA: "ทันตแพทยศาสตรบัณฑิต"}], "ท.บ.": [{ORTH: "ท.บ."}],
"ท.ม.": [{ORTH: "ท.ม.", LEMMA: "ทันตแพทยศาสตรมหาบัณฑิต"}], "ท.ม.": [{ORTH: "ท.ม."}],
"ท.ด.": [{ORTH: "ท.ด.", LEMMA: "ทันตแพทยศาสตรดุษฎีบัณฑิต"}], "ท.ด.": [{ORTH: "ท.ด."}],
"น.บ.": [{ORTH: "น.บ.", LEMMA: "นิติศาสตรบัณฑิต"}], "น.บ.": [{ORTH: "น.บ."}],
"น.ม.": [{ORTH: "น.ม.", LEMMA: "นิติศาสตรมหาบัณฑิต"}], "น.ม.": [{ORTH: "น.ม."}],
"น.ด.": [{ORTH: "น.ด.", LEMMA: "นิติศาสตรดุษฎีบัณฑิต"}], "น.ด.": [{ORTH: "น.ด."}],
"นศ.บ.": [{ORTH: "นศ.บ.", LEMMA: "นิเทศศาสตรบัณฑิต"}], "นศ.บ.": [{ORTH: "นศ.บ."}],
"นศ.ม.": [{ORTH: "นศ.ม.", LEMMA: "นิเทศศาสตรมหาบัณฑิต"}], "นศ.ม.": [{ORTH: "นศ.ม."}],
"นศ.ด.": [{ORTH: "นศ.ด.", LEMMA: "นิเทศศาสตรดุษฎีบัณฑิต"}], "นศ.ด.": [{ORTH: "นศ.ด."}],
"บช.บ.": [{ORTH: "บช.บ.", LEMMA: "บัญชีบัณฑิต"}], "บช.บ.": [{ORTH: "บช.บ."}],
"บช.ม.": [{ORTH: "บช.ม.", LEMMA: "บัญชีมหาบัณฑิต"}], "บช.ม.": [{ORTH: "บช.ม."}],
"บช.ด.": [{ORTH: "บช.ด.", LEMMA: "บัญชีดุษฎีบัณฑิต"}], "บช.ด.": [{ORTH: "บช.ด."}],
"บธ.บ.": [{ORTH: "บธ.บ.", LEMMA: "บริหารธุรกิจบัณฑิต"}], "บธ.บ.": [{ORTH: "บธ.บ."}],
"บธ.ม.": [{ORTH: "บธ.ม.", LEMMA: "บริหารธุรกิจมหาบัณฑิต"}], "บธ.ม.": [{ORTH: "บธ.ม."}],
"บธ.ด.": [{ORTH: "บธ.ด.", LEMMA: "บริหารธุรกิจดุษฎีบัณฑิต"}], "บธ.ด.": [{ORTH: "บธ.ด."}],
"พณ.บ.": [{ORTH: "พณ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}], "พณ.บ.": [{ORTH: "พณ.บ."}],
"พณ.ม.": [{ORTH: "พณ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}], "พณ.ม.": [{ORTH: "พณ.ม."}],
"พณ.ด.": [{ORTH: "พณ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}], "พณ.ด.": [{ORTH: "พณ.ด."}],
"พ.บ.": [{ORTH: "พ.บ.", LEMMA: "แพทยศาสตรบัณฑิต"}], "พ.บ.": [{ORTH: "พ.บ."}],
"พ.ม.": [{ORTH: "พ.ม.", LEMMA: "แพทยศาสตรมหาบัณฑิต"}], "พ.ม.": [{ORTH: "พ.ม."}],
"พ.ด.": [{ORTH: "พ.ด.", LEMMA: "แพทยศาสตรดุษฎีบัณฑิต"}], "พ.ด.": [{ORTH: "พ.ด."}],
"พธ.บ.": [{ORTH: "พธ.บ.", LEMMA: "พุทธศาสตรบัณฑิต"}], "พธ.บ.": [{ORTH: "พธ.บ."}],
"พธ.ม.": [{ORTH: "พธ.ม.", LEMMA: "พุทธศาสตรมหาบัณฑิต"}], "พธ.ม.": [{ORTH: "พธ.ม."}],
"พธ.ด.": [{ORTH: "พธ.ด.", LEMMA: "พุทธศาสตรดุษฎีบัณฑิต"}], "พธ.ด.": [{ORTH: "พธ.ด."}],
"พบ.บ.": [{ORTH: "พบ.บ.", LEMMA: "พัฒนบริหารศาสตรบัณฑิต"}], "พบ.บ.": [{ORTH: "พบ.บ."}],
"พบ.ม.": [{ORTH: "พบ.ม.", LEMMA: "พัฒนบริหารศาสตรมหาบัณฑิต"}], "พบ.ม.": [{ORTH: "พบ.ม."}],
"พบ.ด.": [{ORTH: "พบ.ด.", LEMMA: "พัฒนบริหารศาสตรดุษฎีบัณฑิต"}], "พบ.ด.": [{ORTH: "พบ.ด."}],
"พย.บ.": [{ORTH: "พย.บ.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}], "พย.บ.": [{ORTH: "พย.บ."}],
"พย.ม.": [{ORTH: "พย.ม.", LEMMA: "พยาบาลศาสตรมหาบัณฑิต"}], "พย.ม.": [{ORTH: "พย.ม."}],
"พย.ด.": [{ORTH: "พย.ด.", LEMMA: "พยาบาลศาสตรดุษฎีบัณฑิต"}], "พย.ด.": [{ORTH: "พย.ด."}],
"พศ.บ.": [{ORTH: "พศ.บ.", LEMMA: "พาณิชยศาสตรบัณฑิต"}], "พศ.บ.": [{ORTH: "พศ.บ."}],
"พศ.ม.": [{ORTH: "พศ.ม.", LEMMA: "พาณิชยศาสตรมหาบัณฑิต"}], "พศ.ม.": [{ORTH: "พศ.ม."}],
"พศ.ด.": [{ORTH: "พศ.ด.", LEMMA: "พาณิชยศาสตรดุษฎีบัณฑิต"}], "พศ.ด.": [{ORTH: "พศ.ด."}],
"ภ.บ.": [{ORTH: "ภ.บ.", LEMMA: "เภสัชศาสตรบัณฑิต"}], "ภ.บ.": [{ORTH: "ภ.บ."}],
"ภ.ม.": [{ORTH: "ภ.ม.", LEMMA: "เภสัชศาสตรมหาบัณฑิต"}], "ภ.ม.": [{ORTH: "ภ.ม."}],
"ภ.ด.": [{ORTH: "ภ.ด.", LEMMA: "เภสัชศาสตรดุษฎีบัณฑิต"}], "ภ.ด.": [{ORTH: "ภ.ด."}],
"ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ.", LEMMA: "ภูมิสถาปัตยกรรมศาสตรบัณฑิต"}], "ภ.สถ.บ.": [{ORTH: "ภ.สถ.บ."}],
"รป.บ.": [{ORTH: "รป.บ.", LEMMA: "รัฐประศาสนศาสตร์บัณฑิต"}], "รป.บ.": [{ORTH: "รป.บ."}],
"รป.ม.": [{ORTH: "รป.ม.", LEMMA: "รัฐประศาสนศาสตร์มหาบัณฑิต"}], "รป.ม.": [{ORTH: "รป.ม."}],
"วท.บ.": [{ORTH: "วท.บ.", LEMMA: "วิทยาศาสตรบัณฑิต"}], "วท.บ.": [{ORTH: "วท.บ."}],
"วท.ม.": [{ORTH: "วท.ม.", LEMMA: "วิทยาศาสตรมหาบัณฑิต"}], "วท.ม.": [{ORTH: "วท.ม."}],
"วท.ด.": [{ORTH: "วท.ด.", LEMMA: "วิทยาศาสตรดุษฎีบัณฑิต"}], "วท.ด.": [{ORTH: "วท.ด."}],
"ศ.บ.": [{ORTH: "ศ.บ.", LEMMA: "ศิลปบัณฑิต"}], "ศ.บ.": [{ORTH: "ศ.บ."}],
"ศศ.บ.": [{ORTH: "ศศ.บ.", LEMMA: "ศิลปศาสตรบัณฑิต"}], "ศศ.บ.": [{ORTH: "ศศ.บ."}],
"ศษ.บ.": [{ORTH: "ศษ.บ.", LEMMA: "ศึกษาศาสตรบัณฑิต"}], "ศษ.บ.": [{ORTH: "ศษ.บ."}],
"ศส.บ.": [{ORTH: "ศส.บ.", LEMMA: "เศรษฐศาสตรบัณฑิต"}], "ศส.บ.": [{ORTH: "ศส.บ."}],
"สถ.บ.": [{ORTH: "สถ.บ.", LEMMA: "สถาปัตยกรรมศาสตรบัณฑิต"}], "สถ.บ.": [{ORTH: "สถ.บ."}],
"สถ.ม.": [{ORTH: "สถ.ม.", LEMMA: "สถาปัตยกรรมศาสตรมหาบัณฑิต"}], "สถ.ม.": [{ORTH: "สถ.ม."}],
"สถ.ด.": [{ORTH: "สถ.ด.", LEMMA: "สถาปัตยกรรมศาสตรดุษฎีบัณฑิต"}], "สถ.ด.": [{ORTH: "สถ.ด."}],
"สพ.บ.": [{ORTH: "สพ.บ.", LEMMA: "สัตวแพทยศาสตรบัณฑิต"}], "สพ.บ.": [{ORTH: "สพ.บ."}],
"อ.บ.": [{ORTH: "อ.บ.", LEMMA: "อักษรศาสตรบัณฑิต"}], "อ.บ.": [{ORTH: "อ.บ."}],
"อ.ม.": [{ORTH: "อ.ม.", LEMMA: "อักษรศาสตรมหาบัณฑิต"}], "อ.ม.": [{ORTH: "อ.ม."}],
"อ.ด.": [{ORTH: "อ.ด.", LEMMA: "อักษรศาสตรดุษฎีบัณฑิต"}], "อ.ด.": [{ORTH: "อ.ด."}],
# ปี / เวลา / year / time # ปี / เวลา / year / time
"ชม.": [{ORTH: "ชม.", LEMMA: "ชั่วโมง"}], "ชม.": [{ORTH: "ชม."}],
"จ.ศ.": [{ORTH: "จ.ศ.", LEMMA: "จุลศักราช"}], "จ.ศ.": [{ORTH: "จ.ศ."}],
"ค.ศ.": [{ORTH: "ค.ศ.", LEMMA: "คริสต์ศักราช"}], "ค.ศ.": [{ORTH: "ค.ศ."}],
"ฮ.ศ.": [{ORTH: "ฮ.ศ.", LEMMA: "ฮิจเราะห์ศักราช"}], "ฮ.ศ.": [{ORTH: "ฮ.ศ."}],
"ว.ด.ป.": [{ORTH: "ว.ด.ป.", LEMMA: "วัน เดือน ปี"}], "ว.ด.ป.": [{ORTH: "ว.ด.ป."}],
# ระยะทาง / distance # ระยะทาง / distance
"ฮม.": [{ORTH: "ฮม.", LEMMA: "เฮกโตเมตร"}], "ฮม.": [{ORTH: "ฮม."}],
"ดคม.": [{ORTH: "ดคม.", LEMMA: "เดคาเมตร"}], "ดคม.": [{ORTH: "ดคม."}],
"ดม.": [{ORTH: "ดม.", LEMMA: "เดซิเมตร"}], "ดม.": [{ORTH: "ดม."}],
"มม.": [{ORTH: "มม.", LEMMA: "มิลลิเมตร"}], "มม.": [{ORTH: "มม."}],
"ซม.": [{ORTH: "ซม.", LEMMA: "เซนติเมตร"}], "ซม.": [{ORTH: "ซม."}],
"กม.": [{ORTH: "กม.", LEMMA: "กิโลเมตร"}], "กม.": [{ORTH: "กม."}],
# น้ำหนัก / weight # น้ำหนัก / weight
"น.น.": [{ORTH: "น.น.", LEMMA: "น้ำหนัก"}], "น.น.": [{ORTH: "น.น."}],
"ฮก.": [{ORTH: "ฮก.", LEMMA: "เฮกโตกรัม"}], "ฮก.": [{ORTH: "ฮก."}],
"ดคก.": [{ORTH: "ดคก.", LEMMA: "เดคากรัม"}], "ดคก.": [{ORTH: "ดคก."}],
"ดก.": [{ORTH: "ดก.", LEMMA: "เดซิกรัม"}], "ดก.": [{ORTH: "ดก."}],
"ซก.": [{ORTH: "ซก.", LEMMA: "เซนติกรัม"}], "ซก.": [{ORTH: "ซก."}],
"มก.": [{ORTH: "มก.", LEMMA: "มิลลิกรัม"}], "มก.": [{ORTH: "มก."}],
"ก.": [{ORTH: "ก.", LEMMA: "กรัม"}], "ก.": [{ORTH: "ก."}],
"กก.": [{ORTH: "กก.", LEMMA: "กิโลกรัม"}], "กก.": [{ORTH: "กก."}],
# ปริมาตร / volume # ปริมาตร / volume
"ฮล.": [{ORTH: "ฮล.", LEMMA: "เฮกโตลิตร"}], "ฮล.": [{ORTH: "ฮล."}],
"ดคล.": [{ORTH: "ดคล.", LEMMA: "เดคาลิตร"}], "ดคล.": [{ORTH: "ดคล."}],
"ดล.": [{ORTH: "ดล.", LEMMA: "เดซิลิตร"}], "ดล.": [{ORTH: "ดล."}],
"ซล.": [{ORTH: "ซล.", LEMMA: "เซนติลิตร"}], "ซล.": [{ORTH: "ซล."}],
"ล.": [{ORTH: "ล.", LEMMA: "ลิตร"}], "ล.": [{ORTH: "ล."}],
"กล.": [{ORTH: "กล.", LEMMA: "กิโลลิตร"}], "กล.": [{ORTH: "กล."}],
"ลบ.": [{ORTH: "ลบ.", LEMMA: "ลูกบาศก์"}], "ลบ.": [{ORTH: "ลบ."}],
# พื้นที่ / area # พื้นที่ / area
"ตร.ซม.": [{ORTH: "ตร.ซม.", LEMMA: "ตารางเซนติเมตร"}], "ตร.ซม.": [{ORTH: "ตร.ซม."}],
"ตร.ม.": [{ORTH: "ตร.ม.", LEMMA: "ตารางเมตร"}], "ตร.ม.": [{ORTH: "ตร.ม."}],
"ตร.ว.": [{ORTH: "ตร.ว.", LEMMA: "ตารางวา"}], "ตร.ว.": [{ORTH: "ตร.ว."}],
"ตร.กม.": [{ORTH: "ตร.กม.", LEMMA: "ตารางกิโลเมตร"}], "ตร.กม.": [{ORTH: "ตร.กม."}],
# เดือน / month # เดือน / month
"ม.ค.": [{ORTH: "ม.ค.", LEMMA: "มกราคม"}], "ม.ค.": [{ORTH: "ม.ค."}],
"ก.พ.": [{ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"}], "ก.พ.": [{ORTH: "ก.พ."}],
"มี.ค.": [{ORTH: "มี.ค.", LEMMA: "มีนาคม"}], "มี.ค.": [{ORTH: "มี.ค."}],
"เม.ย.": [{ORTH: "เม.ย.", LEMMA: "เมษายน"}], "เม.ย.": [{ORTH: "เม.ย."}],
"พ.ค.": [{ORTH: "พ.ค.", LEMMA: "พฤษภาคม"}], "พ.ค.": [{ORTH: "พ.ค."}],
"มิ.ย.": [{ORTH: "มิ.ย.", LEMMA: "มิถุนายน"}], "มิ.ย.": [{ORTH: "มิ.ย."}],
"ก.ค.": [{ORTH: "ก.ค.", LEMMA: "กรกฎาคม"}], "ก.ค.": [{ORTH: "ก.ค."}],
"ส.ค.": [{ORTH: "ส.ค.", LEMMA: "สิงหาคม"}], "ส.ค.": [{ORTH: "ส.ค."}],
"ก.ย.": [{ORTH: "ก.ย.", LEMMA: "กันยายน"}], "ก.ย.": [{ORTH: "ก.ย."}],
"ต.ค.": [{ORTH: "ต.ค.", LEMMA: "ตุลาคม"}], "ต.ค.": [{ORTH: "ต.ค."}],
"พ.ย.": [{ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"}], "พ.ย.": [{ORTH: "พ.ย."}],
"ธ.ค.": [{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}], "ธ.ค.": [{ORTH: "ธ.ค."}],
# เพศ / gender # เพศ / gender
"ช.": [{ORTH: "ช.", LEMMA: "ชาย"}], "ช.": [{ORTH: "ช."}],
"ญ.": [{ORTH: "ญ.", LEMMA: "หญิง"}], "ญ.": [{ORTH: "ญ."}],
"ด.ช.": [{ORTH: "ด.ช.", LEMMA: "เด็กชาย"}], "ด.ช.": [{ORTH: "ด.ช."}],
"ด.ญ.": [{ORTH: "ด.ญ.", LEMMA: "เด็กหญิง"}], "ด.ญ.": [{ORTH: "ด.ญ."}],
# ที่อยู่ / address # ที่อยู่ / address
"ถ.": [{ORTH: "ถ.", LEMMA: "ถนน"}], "ถ.": [{ORTH: "ถ."}],
"ต.": [{ORTH: "ต.", LEMMA: "ตำบล"}], "ต.": [{ORTH: "ต."}],
"อ.": [{ORTH: "อ.", LEMMA: "อำเภอ"}], "อ.": [{ORTH: "อ."}],
"จ.": [{ORTH: "จ.", LEMMA: "จังหวัด"}], "จ.": [{ORTH: "จ."}],
# สรรพนาม / pronoun # สรรพนาม / pronoun
"ข้าฯ": [{ORTH: "ข้าฯ", LEMMA: "ข้าพระพุทธเจ้า"}], "ข้าฯ": [{ORTH: "ข้าฯ"}],
"ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ", LEMMA: "ทูลเกล้าทูลกระหม่อม"}], "ทูลเกล้าฯ": [{ORTH: "ทูลเกล้าฯ"}],
"น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ", LEMMA: "น้อมเกล้าน้อมกระหม่อม"}], "น้อมเกล้าฯ": [{ORTH: "น้อมเกล้าฯ"}],
"โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ", LEMMA: "โปรดเกล้าโปรดกระหม่อม"}], "โปรดเกล้าฯ": [{ORTH: "โปรดเกล้าฯ"}],
# การเมือง / politic # การเมือง / politic
"ขจก.": [{ORTH: "ขจก.", LEMMA: "ขบวนการโจรก่อการร้าย"}], "ขจก.": [{ORTH: "ขจก."}],
"ขบด.": [{ORTH: "ขบด.", LEMMA: "ขบวนการแบ่งแยกดินแดน"}], "ขบด.": [{ORTH: "ขบด."}],
"นปช.": [{ORTH: "นปช.", LEMMA: "แนวร่วมประชาธิปไตยขับไล่เผด็จการ"}], "นปช.": [{ORTH: "นปช."}],
"ปชป.": [{ORTH: "ปชป.", LEMMA: "พรรคประชาธิปัตย์"}], "ปชป.": [{ORTH: "ปชป."}],
"ผกค.": [{ORTH: "ผกค.", LEMMA: "ผู้ก่อการร้ายคอมมิวนิสต์"}], "ผกค.": [{ORTH: "ผกค."}],
"พท.": [{ORTH: "พท.", LEMMA: "พรรคเพื่อไทย"}], "พท.": [{ORTH: "พท."}],
"พ.ร.ก.": [{ORTH: "พ.ร.ก.", LEMMA: "พระราชกำหนด"}], "พ.ร.ก.": [{ORTH: "พ.ร.ก."}],
"พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ.", LEMMA: "พระราชกฤษฎีกา"}], "พ.ร.ฎ.": [{ORTH: "พ.ร.ฎ."}],
"พ.ร.บ.": [{ORTH: "พ.ร.บ.", LEMMA: "พระราชบัญญัติ"}], "พ.ร.บ.": [{ORTH: "พ.ร.บ."}],
"รธน.": [{ORTH: "รธน.", LEMMA: "รัฐธรรมนูญ"}], "รธน.": [{ORTH: "รธน."}],
"รบ.": [{ORTH: "รบ.", LEMMA: "รัฐบาล"}], "รบ.": [{ORTH: "รบ."}],
"รสช.": [{ORTH: "รสช.", LEMMA: "คณะรักษาความสงบเรียบร้อยแห่งชาติ"}], "รสช.": [{ORTH: "รสช."}],
"ส.ก.": [{ORTH: "ส.ก.", LEMMA: "สมาชิกสภากรุงเทพมหานคร"}], "ส.ก.": [{ORTH: "ส.ก."}],
"สจ.": [{ORTH: "สจ.", LEMMA: "สมาชิกสภาจังหวัด"}], "สจ.": [{ORTH: "สจ."}],
"สว.": [{ORTH: "สว.", LEMMA: "สมาชิกวุฒิสภา"}], "สว.": [{ORTH: "สว."}],
"ส.ส.": [{ORTH: "ส.ส.", LEMMA: "สมาชิกสภาผู้แทนราษฎร"}], "ส.ส.": [{ORTH: "ส.ส."}],
# ทั่วไป / general # ทั่วไป / general
"ก.ข.ค.": [{ORTH: "ก.ข.ค.", LEMMA: "ก้างขวางคอ"}], "ก.ข.ค.": [{ORTH: "ก.ข.ค."}],
"กทม.": [{ORTH: "กทม.", LEMMA: "กรุงเทพมหานคร"}], "กทม.": [{ORTH: "กทม."}],
"กรุงเทพฯ": [{ORTH: "กรุงเทพฯ", LEMMA: "กรุงเทพมหานคร"}], "กรุงเทพฯ": [{ORTH: "กรุงเทพฯ"}],
"ขรก.": [{ORTH: "ขรก.", LEMMA: "ข้าราชการ"}], "ขรก.": [{ORTH: "ขรก."}],
"ขส": [{ORTH: "ขส.", LEMMA: "ขนส่ง"}], "ขส": [{ORTH: "ขส."}],
"ค.ร.น.": [{ORTH: "ค.ร.น.", LEMMA: "คูณร่วมน้อย"}], "ค.ร.น.": [{ORTH: "ค.ร.น."}],
"ค.ร.ม.": [{ORTH: "ค.ร.ม.", LEMMA: "คูณร่วมมาก"}], "ค.ร.ม.": [{ORTH: "ค.ร.ม."}],
"ง.ด.": [{ORTH: "ง.ด.", LEMMA: "เงินเดือน"}], "ง.ด.": [{ORTH: "ง.ด."}],
"งป.": [{ORTH: "งป.", LEMMA: "งบประมาณ"}], "งป.": [{ORTH: "งป."}],
"จก.": [{ORTH: "จก.", LEMMA: "จำกัด"}], "จก.": [{ORTH: "จก."}],
"จขกท.": [{ORTH: "จขกท.", LEMMA: "เจ้าของกระทู้"}], "จขกท.": [{ORTH: "จขกท."}],
"จนท.": [{ORTH: "จนท.", LEMMA: "เจ้าหน้าที่"}], "จนท.": [{ORTH: "จนท."}],
"จ.ป.ร.": [ "จ.ป.ร.": [{ORTH: "จ.ป.ร."}],
{ "จ.ม.": [{ORTH: "จ.ม."}],
ORTH: "จ.ป.ร.", "จย.": [{ORTH: "จย."}],
LEMMA: "มหาจุฬาลงกรณ ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระจุลจอมเกล้าเจ้าอยู่หัว)", "จยย.": [{ORTH: "จยย."}],
} "ตจว.": [{ORTH: "ตจว."}],
], "โทร.": [{ORTH: "โทร."}],
"จ.ม.": [{ORTH: "จ.ม.", LEMMA: "จดหมาย"}], "ธ.": [{ORTH: "ธ."}],
"จย.": [{ORTH: "จย.", LEMMA: "จักรยาน"}], "น.ร.": [{ORTH: "น.ร."}],
"จยย.": [{ORTH: "จยย.", LEMMA: "จักรยานยนต์"}], "น.ศ.": [{ORTH: "น.ศ."}],
"ตจว.": [{ORTH: "ตจว.", LEMMA: "ต่างจังหวัด"}], "น.ส.": [{ORTH: "น.ส."}],
"โทร.": [{ORTH: "โทร.", LEMMA: "โทรศัพท์"}], "น.ส.๓": [{ORTH: "น.ส.๓"}],
"ธ.": [{ORTH: "ธ.", LEMMA: "ธนาคาร"}], "น.ส.๓ ก.": [{ORTH: "น.ส.๓ ก"}],
"น.ร.": [{ORTH: "น.ร.", LEMMA: "นักเรียน"}], "นสพ.": [{ORTH: "นสพ."}],
"น.ศ.": [{ORTH: "น.ศ.", LEMMA: "นักศึกษา"}], "บ.ก.": [{ORTH: "บ.ก."}],
"น.ส.": [{ORTH: "น.ส.", LEMMA: "นางสาว"}], "บจก.": [{ORTH: "บจก."}],
"น.ส.๓": [{ORTH: "น.ส.๓", LEMMA: "หนังสือรับรองการทำประโยชน์ในที่ดิน"}], "บงล.": [{ORTH: "บงล."}],
"น.ส.๓ ก.": [ "บบส.": [{ORTH: "บบส."}],
{ORTH: "น.ส.๓ ก", LEMMA: "หนังสือแสดงกรรมสิทธิ์ในที่ดิน (มีระวางกำหนด)"} "บมจ.": [{ORTH: "บมจ."}],
], "บลจ.": [{ORTH: "บลจ."}],
"นสพ.": [{ORTH: "นสพ.", LEMMA: "หนังสือพิมพ์"}], "บ/ช": [{ORTH: "บ/ช"}],
"บ.ก.": [{ORTH: "บ.ก.", LEMMA: "บรรณาธิการ"}], "บร.": [{ORTH: "บร."}],
"บจก.": [{ORTH: "บจก.", LEMMA: "บริษัทจำกัด"}], "ปชช.": [{ORTH: "ปชช."}],
"บงล.": [{ORTH: "บงล.", LEMMA: "บริษัทเงินทุนและหลักทรัพย์จำกัด"}], "ปณ.": [{ORTH: "ปณ."}],
"บบส.": [{ORTH: "บบส.", LEMMA: "บรรษัทบริหารสินทรัพย์สถาบันการเงิน"}], "ปณก.": [{ORTH: "ปณก."}],
"บมจ.": [{ORTH: "บมจ.", LEMMA: "บริษัทมหาชนจำกัด"}], "ปณส.": [{ORTH: "ปณส."}],
"บลจ.": [{ORTH: "บลจ.", LEMMA: "บริษัทหลักทรัพย์จัดการกองทุนรวมจำกัด"}], "ปธ.": [{ORTH: "ปธ."}],
"บ/ช": [{ORTH: "บ/ช", LEMMA: "บัญชี"}], "ปธน.": [{ORTH: "ปธน."}],
"บร.": [{ORTH: "บร.", LEMMA: "บรรณารักษ์"}], "ปอ.": [{ORTH: "ปอ."}],
"ปชช.": [{ORTH: "ปชช.", LEMMA: "ประชาชน"}], "ปอ.พ.": [{ORTH: "ปอ.พ."}],
"ปณ.": [{ORTH: "ปณ.", LEMMA: "ที่ทำการไปรษณีย์"}], "พ.ก.ง.": [{ORTH: "พ.ก.ง."}],
"ปณก.": [{ORTH: "ปณก.", LEMMA: "ที่ทำการไปรษณีย์กลาง"}], "พ.ก.ส.": [{ORTH: "พ.ก.ส."}],
"ปณส.": [{ORTH: "ปณส.", LEMMA: "ที่ทำการไปรษณีย์สาขา"}], "พขร.": [{ORTH: "พขร."}],
"ปธ.": [{ORTH: "ปธ.", LEMMA: "ประธาน"}], "ภ.ง.ด.": [{ORTH: "ภ.ง.ด."}],
"ปธน.": [{ORTH: "ปธน.", LEMMA: "ประธานาธิบดี"}], "ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙"}],
"ปอ.": [{ORTH: "ปอ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศ"}], "ภ.ป.ร.": [{ORTH: "ภ.ป.ร."}],
"ปอ.พ.": [{ORTH: "ปอ.พ.", LEMMA: "รถยนต์โดยสารประจำทางปรับอากาศพิเศษ"}], "ภ.พ.": [{ORTH: "ภ.พ."}],
"พ.ก.ง.": [{ORTH: "พ.ก.ง.", LEMMA: "พัสดุเก็บเงินปลายทาง"}], "ร.": [{ORTH: "ร."}],
"พ.ก.ส.": [{ORTH: "พ.ก.ส.", LEMMA: "พนักงานเก็บค่าโดยสาร"}], "ร.ง.": [{ORTH: "ร.ง."}],
"พขร.": [{ORTH: "พขร.", LEMMA: "พนักงานขับรถ"}], "ร.ด.": [{ORTH: "ร.ด."}],
"ภ.ง.ด.": [{ORTH: "ภ.ง.ด.", LEMMA: "ภาษีเงินได้"}], "รปภ.": [{ORTH: "รปภ."}],
"ภ.ง.ด.๙": [{ORTH: "ภ.ง.ด.๙", LEMMA: "แบบแสดงรายการเสียภาษีเงินได้ของกรมสรรพากร"}], "รพ.": [{ORTH: "รพ."}],
"ภ.ป.ร.": [ "ร.พ.": [{ORTH: "ร.พ."}],
{ "รร.": [{ORTH: "รร."}],
ORTH: "ภ.ป.ร.", "รสก.": [{ORTH: "รสก."}],
LEMMA: "ภูมิพลอดุยเดช ปรมราชาธิราช (พระปรมาภิไธยในพระบาทสมเด็จพระปรมินทรมหาภูมิพลอดุลยเดช)", "ส.ค.ส.": [{ORTH: "ส.ค.ส."}],
} "สต.": [{ORTH: "สต."}],
], "สน.": [{ORTH: "สน."}],
"ภ.พ.": [{ORTH: "ภ.พ.", LEMMA: "ภาษีมูลค่าเพิ่ม"}], "สนข.": [{ORTH: "สนข."}],
"ร.": [{ORTH: "ร.", LEMMA: "รัชกาล"}], "สนง.": [{ORTH: "สนง."}],
"ร.ง.": [{ORTH: "ร.ง.", LEMMA: "โรงงาน"}], "สนญ.": [{ORTH: "สนญ."}],
"ร.ด.": [{ORTH: "ร.ด.", LEMMA: "รักษาดินแดน"}], "ส.ป.ช.": [{ORTH: "ส.ป.ช."}],
"รปภ.": [{ORTH: "รปภ.", LEMMA: "รักษาความปลอดภัย"}], "สภ.": [{ORTH: "สภ."}],
"รพ.": [{ORTH: "รพ.", LEMMA: "โรงพยาบาล"}], "ส.ล.น.": [{ORTH: "ส.ล.น."}],
"ร.พ.": [{ORTH: "ร.พ.", LEMMA: "โรงพิมพ์"}], "สวญ.": [{ORTH: "สวญ."}],
"รร.": [{ORTH: "รร.", LEMMA: "โรงเรียน,โรงแรม"}], "สวป.": [{ORTH: "สวป."}],
"รสก.": [{ORTH: "รสก.", LEMMA: "รัฐวิสาหกิจ"}], "สว.สส.": [{ORTH: "สว.สส."}],
"ส.ค.ส.": [{ORTH: "ส.ค.ส.", LEMMA: "ส่งความสุขปีใหม่"}], "ส.ห.": [{ORTH: "ส.ห."}],
"สต.": [{ORTH: "สต.", LEMMA: "สตางค์"}], "สอ.": [{ORTH: "สอ."}],
"สน.": [{ORTH: "สน.", LEMMA: "สถานีตำรวจ"}], "สอท.": [{ORTH: "สอท."}],
"สนข.": [{ORTH: "สนข.", LEMMA: "สำนักงานเขต"}], "เสธ.": [{ORTH: "เสธ."}],
"สนง.": [{ORTH: "สนง.", LEMMA: "สำนักงาน"}], "หจก.": [{ORTH: "หจก."}],
"สนญ.": [{ORTH: "สนญ.", LEMMA: "สำนักงานใหญ่"}], "ห.ร.ม.": [{ORTH: "ห.ร.ม."}],
"ส.ป.ช.": [{ORTH: "ส.ป.ช.", LEMMA: "สร้างเสริมประสบการณ์ชีวิต"}],
"สภ.": [{ORTH: "สภ.", LEMMA: "สถานีตำรวจภูธร"}],
"ส.ล.น.": [{ORTH: "ส.ล.น.", LEMMA: "สร้างเสริมลักษณะนิสัย"}],
"สวญ.": [{ORTH: "สวญ.", LEMMA: "สารวัตรใหญ่"}],
"สวป.": [{ORTH: "สวป.", LEMMA: "สารวัตรป้องกันปราบปราม"}],
"สว.สส.": [{ORTH: "สว.สส.", LEMMA: "สารวัตรสืบสวน"}],
"ส.ห.": [{ORTH: "ส.ห.", LEMMA: "สารวัตรทหาร"}],
"สอ.": [{ORTH: "สอ.", LEMMA: "สถานีอนามัย"}],
"สอท.": [{ORTH: "สอท.", LEMMA: "สถานเอกอัครราชทูต"}],
"เสธ.": [{ORTH: "เสธ.", LEMMA: "เสนาธิการ"}],
"หจก.": [{ORTH: "หจก.", LEMMA: "ห้างหุ้นส่วนจำกัด"}],
"ห.ร.ม.": [{ORTH: "ห.ร.ม.", LEMMA: "ตัวหารร่วมมาก"}],
} }
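The pattern in this hunk is uniform: every Thai tokenizer exception keeps an ORTH-only attribute list on the right-hand side, while the LEMMA strings from the left-hand side are dropped. A minimal sketch of one entry in both forms, using the Bangkok abbreviation shown above (the _EXC_OLD/_EXC_NEW names and the import line are assumptions for illustration only; the hunk itself shows just the dict body):

from spacy.attrs import LEMMA, ORTH  # assumed imports for this sketch

# Old form (left column): each exception carried the surface form plus a lemma string.
_EXC_OLD = {"กทม.": [{ORTH: "กทม.", LEMMA: "กรุงเทพมหานคร"}]}

# New form (right column): exceptions keep only the surface form; lemma data
# no longer lives in the tokenizer exceptions.
_EXC_NEW = {"กทม.": [{ORTH: "กทม."}]}

Both forms are plain dicts keyed by the abbreviation string; only the per-token attribute dicts shrink.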

Some files were not shown because too many files have changed in this diff.