mirror of
https://github.com/explosion/spaCy.git
Refactor pipeline components, config and language data (#5759)
* Update with WIP
* Update with WIP
* Update with pipeline serialization
* Update types and pipe factories
* Add deep merge, tidy up and add tests
* Fix pipe creation from config
* Don't validate default configs on load
* Update spacy/language.py
  Co-authored-by: Ines Montani <ines@ines.io>
* Adjust factory/component meta error
* Clean up factory args and remove defaults
* Add test for failing empty dict defaults
* Update pipeline handling and methods
* provide KB as registry function instead of as object
* small change in test to make functionality more clear
* update example script for EL configuration
* Fix typo
* Simplify test
* Simplify test
* splitting pipes.pyx into separate files
* moving default configs to each component file
* fix batch_size type
* removing default values from component constructors where possible (TODO: test 4725)
* skip instead of xfail
* Add test for config -> nlp with multiple instances
* pipeline.pipes -> pipeline.pipe
* Tidy up, document, remove kwargs
* small cleanup/generalization for Tok2VecListener
* use DEFAULT_UPSTREAM field
* revert to avoid circular imports
* Fix tests
* Replace deprecated arg
* Make model dirs require config
* fix pickling of keyword-only arguments in constructor
* WIP: clean up and integrate full config
* Add helper to handle function args more reliably. Now also includes keyword-only args
* Fix config composition and serialization
* Improve config debugging and add visual diff
* Remove unused defaults and fix type
* Remove pipeline and factories from meta
* Update spacy/default_config.cfg
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/default_config.cfg
* small UX edits
* avoid printing stack trace for debug CLI commands
* Add support for language-specific factories
* specify the section of the config which holds the model to debug
* WIP: add Language.from_config
* Update with language data refactor WIP
* Auto-format
* Add backwards-compat handling for Language.factories
* Update morphologizer.pyx
* Fix morphologizer
* Update and simplify lemmatizers
* Fix Japanese tests
* Port over tagger changes
* Fix Chinese and tests
* Update to latest Thinc
* WIP: xfail first Russian lemmatizer test
* Fix component-specific overrides
* fix nO for output layers in debug_model
* Fix default value
* Fix tests and don't pass objects in config
* Fix deep merging
* Fix lemma lookup data registry. Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)
* Add types
* Add Vocab.from_config
* Fix typo
* Fix tests
* Make config copying more elegant
* Fix pipe analysis
* Fix lemmatizers and is_base_form
* WIP: move language defaults to config
* Fix morphology type
* Fix vocab
* Remove comment
* Update to latest Thinc
* Add morph rules to config
* Tidy up
* Remove set_morphology option from tagger factory
* Hack use_gpu
* Move [pipeline] to top-level block and make [nlp.pipeline] list. Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them
* Fix use_gpu and resume in CLI
* Auto-format
* Remove resume from config
* Fix formatting and error
* [pipeline] -> [components]
* Fix types
* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent 311d0bde29
commit 43b960c01b
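The commit message above describes moving component settings into a top-level [components] block while [nlp.pipeline] stays a plain list of names, so reordering blocks in the config file no longer changes the component order. A minimal sketch of what such a layout looks like when loaded with Thinc's Config; the component names and factory values here are illustrative only, not taken from this diff:

    from thinc.api import Config

    # Illustrative config text: the pipeline *order* lives in [nlp.pipeline],
    # while each component's settings live under [components.<name>], so
    # reordering the blocks in the file does not reorder the pipeline.
    cfg_text = """
    [nlp]
    lang = "en"
    pipeline = ["tagger", "parser"]

    [components]

    [components.tagger]
    @factories = "tagger"

    [components.parser]
    @factories = "parser"
    """

    config = Config().from_str(cfg_text)
    print(config["nlp"]["pipeline"])   # ['tagger', 'parser']
    print(list(config["components"]))  # ['tagger', 'parser']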
@@ -17,7 +17,6 @@ import plac
 import random
 from pathlib import Path
 import spacy
-from spacy.kb import KnowledgeBase
 from spacy.gold import Example
 from spacy.pipeline import EntityRuler

@@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):

     # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        kb = KnowledgeBase(vocab=nlp.vocab)
-        kb.load_bulk(kb_path)
-        print("Loaded Knowledge Base from '%s'" % kb_path)
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"kb": kb, "incl_prior": False}
+        print("Loading Knowledge Base from '%s'" % kb_path)
+        cfg = {
+            "kb": {
+                "@assets": "spacy.KBFromFile.v1",
+                "vocab_path": vocab_path,
+                "kb_path": kb_path,
+            },
+            # use only the predicted EL score and not the prior probability (for demo purposes)
+            "incl_prior": False,
+        }
         entity_linker = nlp.create_pipe("entity_linker", cfg)
         nlp.add_pipe(entity_linker, last=True)

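In the example script above, the knowledge base is no longer built and passed in as an object; the config now names a registered function ("spacy.KBFromFile.v1") plus its arguments, and the object is created when the pipe is created. A rough, stand-alone sketch of that reference-plus-arguments pattern using catalogue directly; the registry and loader below are hypothetical stand-ins, not spaCy's actual registration:

    import catalogue

    # Stand-alone illustration of "registered name + arguments in the config,
    # object created at load time". The registry and loader are hypothetical.
    kb_loaders = catalogue.create("demo", "kb_loaders")

    @kb_loaders.register("demo.KBFromFile.v1")
    def load_kb(vocab_path: str, kb_path: str):
        # spaCy's real "spacy.KBFromFile.v1" would build a KnowledgeBase from
        # these paths; this stub just echoes its arguments.
        return {"vocab_path": vocab_path, "kb_path": kb_path}

    cfg = {"@kb_loaders": "demo.KBFromFile.v1", "vocab_path": "my_vocab", "kb_path": "my_kb"}
    make_kb = kb_loaders.get(cfg["@kb_loaders"])
    kb = make_kb(**{k: v for k, v in cfg.items() if not k.startswith("@")})
    print(kb)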
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a18,<8.0.0a20",
+    "thinc>=8.0.0a19,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations"
 ]

@@ -1,11 +1,11 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a18,<8.0.0a20
+thinc>=8.0.0a19,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.7.0,<1.1.0
+wasabi>=0.7.1,<1.1.0
 srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 typer>=0.3.0,<0.4.0

@@ -34,15 +34,15 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a18,<8.0.0a20
+    thinc>=8.0.0a19,<8.0.0a30
     blis>=0.4.0,<0.5.0
-    wasabi>=0.7.0,<1.1.0
+    wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
     typer>=0.3.0,<0.4.0

setup.py (8 changed lines)
@@ -32,8 +32,14 @@ MOD_NAMES = [
     "spacy.attrs",
     "spacy.kb",
     "spacy.morphology",
-    "spacy.pipeline.pipes",
+    "spacy.pipeline.dep_parser",
     "spacy.pipeline.morphologizer",
+    "spacy.pipeline.multitask",
+    "spacy.pipeline.ner",
+    "spacy.pipeline.pipe",
+    "spacy.pipeline.sentencizer",
+    "spacy.pipeline.senter",
+    "spacy.pipeline.tagger",
     "spacy.syntax.stateclass",
     "spacy.syntax._state",
     "spacy.tokenizer",

@@ -14,7 +14,6 @@ from .about import __version__
 from .errors import Errors, Warnings
 from . import util
 from .util import registry
-from .language import component


 if sys.maxunicode == 65535:

@@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
     result = {}
     while args:
         opt = args.pop(0)
-        err = f"Invalid config override '{opt}'"
+        err = f"Invalid CLI argument '{opt}'"
         if opt.startswith("--"):  # new argument
             opt = opt.replace("--", "").replace("-", "_")
             if "." not in opt:
@@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
         else:
             value = args.pop(0)
             # Just like we do in the config, we're calling json.loads on the
-            # values. But since they come from the CLI, it'd b unintuitive to
+            # values. But since they come from the CLI, it'd be unintuitive to
             # explicitly mark strings with escaped quotes. So we're working
             # around that here by falling back to a string if parsing fails.
             # TODO: improve logic to handle simple types like list of strings?
@@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
             except ValueError:
                 result[opt] = str(value)
         else:
-            msg.fail(f"{err}: options need to start with --", exits=1)
+            msg.fail(f"{err}: override option should start with --", exits=1)
     return result


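The parse_config_overrides changes above keep the behaviour of turning --section.key value pairs into a flat override dict, JSON-parsing each value and falling back to a plain string. A simplified, self-contained sketch of that behaviour (the real function's error handling and section checks are reduced here):

    import json
    from typing import Any, Dict, List

    def parse_overrides(args: List[str]) -> Dict[str, Any]:
        # "--training.batch_size 128" becomes {"training.batch_size": 128}
        result: Dict[str, Any] = {}
        while args:
            opt = args.pop(0)
            if not opt.startswith("--") or "." not in opt:
                raise ValueError(f"Invalid CLI argument '{opt}'")
            key = opt.replace("--", "").replace("-", "_")
            value = args.pop(0)
            try:
                # Same trick as in the config: values are parsed as JSON ...
                result[key] = json.loads(value)
            except ValueError:
                # ... but unquoted CLI strings fall back to plain str
                result[key] = str(value)
        return result

    print(parse_overrides(["--training.batch_size", "128", "--nlp.lang", "en"]))
    # {'training.batch_size': 128, 'nlp.lang': 'en'}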
@@ -3,12 +3,12 @@ from pathlib import Path
 from collections import Counter
 import sys
 import srsly
-from wasabi import Printer, MESSAGES, msg
+from wasabi import Printer, MESSAGES, msg, diff_strings
 import typer
+from thinc.api import Config

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
-from ..schemas import ConfigSchema
 from ..gold import Corpus, Example
 from ..syntax import nonproj
 from ..language import Language
@@ -33,6 +33,9 @@ def debug_config_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
+    auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
+    diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
     # fmt: on
 ):
     """Debug a config.cfg file and show validation errors. The command will
@@ -40,14 +43,37 @@ def debug_config_cli(
     validation errors are blocking and will prevent the rest of the config from
     being resolved. This means that you may not see all validation errors at
     once and some issues are only shown once previous errors have been fixed.
+    Similar as with the 'train' command, you can override settings from the config
+    as command line options. For instance, --training.batch_size 128 overrides
+    the value of "batch_size" in the block "[training]".
     """
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     with show_validation_error():
-        util.load_config(
-            config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
-        )
-    msg.good("Config is valid")
+        config = Config().from_disk(config_path)
+        try:
+            nlp, _ = util.load_model_from_config(
+                config, overrides=overrides, auto_fill=auto_fill
+            )
+        except ValueError as e:
+            msg.fail(str(e), exits=1)
+    is_stdout = output_path is not None and str(output_path) == "-"
+    if auto_fill:
+        orig_config = config.to_str()
+        filled_config = nlp.config.to_str()
+        if orig_config == filled_config:
+            msg.good("Original config is valid, no values were auto-filled")
+        else:
+            msg.good("Auto-filled config is valid")
+            if diff:
+                print(diff_strings(config.to_str(), nlp.config.to_str()))
+    else:
+        msg.good("Original config is valid", show=not is_stdout)
+    if is_stdout:
+        print(nlp.config.to_str())
+    elif output_path is not None:
+        nlp.config.to_disk(output_path)
+        msg.good(f"Saved updated config to {output_path}")


 @debug_cli.command(
@@ -117,16 +143,13 @@ def debug_data(
     if not config_path.exists():
         msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    nlp = util.load_model_from_config(config["nlp"])
+        cfg = Config().from_disk(config_path)
+        nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
+    # TODO: handle base model
     lang = config["nlp"]["lang"]
-    base_model = config["nlp"]["base_model"]
-    pipeline = list(config["nlp"]["pipeline"].keys())
+    base_model = config["training"]["base_model"]
+    pipeline = nlp.pipe_names
+    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
     tag_map_path = util.ensure_path(config["training"]["tag_map"])
     tag_map = {}
     if tag_map_path is not None:
@@ -164,19 +187,17 @@ def debug_data(
     msg.good("Corpus is loadable")

     # Create all gold data here to avoid iterating over the train_dataset constantly
-    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
+    gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
     gold_train_unpreprocessed_data = _compile_gold(
-        train_dataset, pipeline, nlp, make_proj=False
+        train_dataset, factory_names, nlp, make_proj=False
     )
-    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
+    gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]

     msg.divider("Training stats")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
-    for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail(f"Pipeline component '{pipe}' not available in factories")
     if base_model:
         msg.text(f"Starting with base model '{base_model}'")
     else:
@@ -244,7 +265,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the model")

-    if "ner" in pipeline:
+    if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(
             label for label in gold_train_data["ner"] if label not in ("O", "-", None)
@@ -332,7 +353,7 @@ def debug_data(
             "with punctuation can not be trained with a noise level > 0."
         )

-    if "textcat" in pipeline:
+    if "textcat" in factory_names:
         msg.divider("Text Classification")
         labels = [label for label in gold_train_data["cats"]]
         model_labels = _get_labels_from_model(nlp, "textcat")
@@ -379,7 +400,7 @@ def debug_data(
             "contains only instances with mutually-exclusive classes."
         )

-    if "tagger" in pipeline:
+    if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
         tag_map = nlp.vocab.morphology.tag_map
@@ -394,7 +415,7 @@ def debug_data(
         for label in non_tagmap:
             msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")

-    if "parser" in pipeline:
+    if "parser" in factory_names:
         has_low_data_warning = False
         msg.divider("Dependency Parsing")

@@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:


 def _compile_gold(
-    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
+    examples: Sequence[Example],
+    factory_names: List[str],
+    nlp: Language,
+    make_proj: bool,
 ) -> Dict[str, Any]:
     data = {
         "ner": Counter(),
@@ -573,7 +597,7 @@ def _compile_gold(
         for word in valid_words:
             if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                 data["words_missing_vectors"].update([word])
-        if "ner" in pipeline:
+        if "ner" in factory_names:
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
@@ -595,14 +619,14 @@ def _compile_gold(
                     data["ner"][combined_label] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "textcat" in pipeline:
+        if "textcat" in factory_names:
            data["cats"].update(gold.cats)
            if list(gold.cats.values()).count(1.0) != 1:
                data["n_cats_multilabel"] += 1
-        if "tagger" in pipeline:
+        if "tagger" in factory_names:
            tags = eg.get_aligned("TAG", as_string=True)
            data["tags"].update([x for x in tags if x is not None])
-        if "parser" in pipeline:
+        if "parser" in factory_names:
            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
            data["deps"].update([x for x in aligned_deps if x is not None])
            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
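The debug config command above now reads the raw file with Config().from_disk, builds the nlp object via util.load_model_from_config (optionally auto-filling defaults), and can print a visual diff of the original versus filled config. A sketch of that flow using the calls as they appear in the hunks above; the config path is a placeholder and the signatures reflect the API at the time of this commit:

    from pathlib import Path
    from thinc.api import Config
    from wasabi import diff_strings
    from spacy import util

    config_path = Path("config.cfg")  # placeholder path to an existing config
    config = Config().from_disk(config_path)
    # As used above, load_model_from_config returns the nlp object plus the
    # resolved config; auto_fill=True completes missing settings with defaults.
    nlp, _ = util.load_model_from_config(config, auto_fill=True)
    filled = nlp.config.to_str()
    if filled == config.to_str():
        print("Original config is valid, no values were auto-filled")
    else:
        print(diff_strings(config.to_str(), filled))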
@@ -1,8 +1,11 @@
+from typing import Dict, Any, Optional
 from pathlib import Path
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
+from thinc.api import Model
+import typer

-from ._util import Arg, Opt, debug_cli
+from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 from .. import util
 from ..lang.en import English

@@ -10,8 +13,10 @@ from ..lang.en import English
 @debug_cli.command("model")
 def debug_model_cli(
     # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
-    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
+    section: str = Arg(..., help="Section that defines the model to be analysed"),
+    layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
     dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
     parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
     gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
@@ -20,14 +25,18 @@ def debug_model_cli(
     P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
     P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
     P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
-    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
-    seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     """
     Analyze a Thinc model implementation. Includes checks for internal structure
     and activations during training.
     """
+    if use_gpu >= 0:
+        msg.info("Using GPU")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
     print_settings = {
         "dimensions": dimensions,
         "parameters": parameters,
@@ -39,27 +48,47 @@ def debug_model_cli(
         "print_after_training": P2,
         "print_prediction": P3,
     }
+    config_overrides = parse_config_overrides(ctx.args)
+    cfg = Config().from_disk(config_path)
+    with show_validation_error():
+        try:
+            _, config = util.load_model_from_config(cfg, overrides=config_overrides)
+        except ValueError as e:
+            msg.fail(str(e), exits=1)
+    seed = config["pretraining"]["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
+
+    component = config
+    parts = section.split(".")
+    for item in parts:
+        try:
+            component = component[item]
+        except KeyError:
+            msg.fail(
+                f"The section '{section}' is not a valid section in the provided config.",
+                exits=1,
+            )
+    if hasattr(component, "model"):
+        model = component.model
     else:
-        msg.info(f"Using CPU")
-    debug_model(
-        config_path, print_settings=print_settings,
-    )
+        msg.fail(
+            f"The section '{section}' does not specify an object that holds a Model.",
+            exits=1,
+        )
+    debug_model(model, print_settings=print_settings)


-def debug_model(config_path: Path, *, print_settings=None):
+def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+    if not isinstance(model, Model):
+        msg.fail(
+            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
+            exits=1,
+        )
     if print_settings is None:
         print_settings = {}

-    model = util.load_config(config_path, create_objects=True)["model"]
-
     # STEP 0: Printing before training
     msg.info(f"Analysing model with ID {model.id}")
     if print_settings.get("print_before_training"):
@@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
         _print_model(model, print_settings)

     # STEP 1: Initializing the model and printing again
-    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
+    Y = _get_output(model.ops.xp)
+    _set_output_dim(nO=Y.shape[-1], model=model)
+    model.initialize(X=_get_docs(), Y=Y)
     if print_settings.get("print_after_init"):
         msg.info(f"After initialization:")
         _print_model(model, print_settings)
@@ -110,12 +141,16 @@ def _get_docs():


 def _get_output(xp):
-    return xp.asarray(
-        [
-            xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
-            for i, _ in enumerate(_get_docs())
-        ]
-    )
+    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+
+
+def _set_output_dim(model, nO):
+    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
+    if model.has_dim("nO") is None:
+        model.set_dim("nO", nO)
+    if model.has_ref("output_layer"):
+        if model.get_ref("output_layer").has_dim("nO") is None:
+            model.get_ref("output_layer").set_dim("nO", nO)


 def _print_model(model, print_settings):
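debug model above now takes a dotted config section such as "components.tok2vec.model" and walks the resolved config to find the object holding the model, instead of loading a hard-coded ["model"] entry. A small stand-alone sketch of that dotted-path walk over plain nested dicts (no spaCy objects involved):

    from typing import Any, Dict

    def resolve_section(config: Dict[str, Any], section: str) -> Any:
        # Walk a dotted path like "components.tok2vec.model" through nested
        # dicts, mirroring the loop added to debug_model_cli above.
        node: Any = config
        for part in section.split("."):
            try:
                node = node[part]
            except (KeyError, TypeError):
                raise KeyError(f"'{section}' is not a valid section in the config")
        return node

    cfg = {"components": {"tok2vec": {"model": "<some Thinc model>"}}}
    print(resolve_section(cfg, "components.tok2vec.model"))  # <some Thinc model>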
@@ -105,9 +105,10 @@ def evaluate(
         print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)

     if displacy_path:
+        factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
         docs = [ex.predicted for ex in dev_dataset]
-        render_deps = "parser" in nlp.meta.get("pipeline", [])
-        render_ents = "ner" in nlp.meta.get("pipeline", [])
+        render_deps = "parser" in factory_names
+        render_ents = "ner" in factory_names
         render_parses(
             docs,
             displacy_path,
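evaluate() above derives the displaCy render options from the factory names of the loaded components rather than from meta["pipeline"], so a parser or NER component registered under a custom name is still detected. A tiny runnable sketch with a stub object standing in for the pipe meta used above:

    from dataclasses import dataclass

    @dataclass
    class PipeMeta:
        # Minimal stand-in for the meta object whose .factory is read above
        factory: str

    class FakePipeline:
        # Stub so the sketch runs on its own; names differ from factories on purpose
        pipe_names = ["my_tagger", "my_ner"]
        _meta = {"my_tagger": PipeMeta("tagger"), "my_ner": PipeMeta("ner")}
        def get_pipe_meta(self, name):
            return self._meta[name]

    nlp = FakePipeline()
    factory_names = [nlp.get_pipe_meta(name).factory for name in nlp.pipe_names]
    print("parser" in factory_names, "ner" in factory_names)  # False True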
@@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
         msg.fail("Can't find model meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
     if model_path.resolve() != model_path:
-        meta["link"] = str(model_path)
         meta["source"] = str(model_path.resolve())
     else:
         meta["source"] = str(model_path)
@@ -125,7 +125,6 @@ def get_meta(
     meta.update(existing_meta)
     nlp = util.load_model_from_path(Path(model_path))
     meta["spacy_version"] = util.get_model_version_range(about.__version__)
-    meta["pipeline"] = nlp.pipe_names
     meta["vectors"] = {
         "width": nlp.vocab.vectors_length,
         "vectors": len(nlp.vocab.vectors),
@@ -5,7 +5,7 @@ import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu
+from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
 from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
 from thinc.api import CosineDistance, L2Distance
 from wasabi import msg
@@ -15,7 +15,6 @@ import typer

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
-from ..schemas import ConfigSchema
 from ..errors import Errors
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
@@ -37,6 +36,7 @@ def pretrain_cli(
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
     # fmt: on
 ):
     """
@@ -67,6 +67,7 @@ def pretrain_cli(
         config_overrides=overrides,
         resume_path=resume_path,
         epoch_resume=epoch_resume,
+        use_gpu=use_gpu,
     )


@@ -77,40 +78,29 @@ def pretrain(
     config_overrides: Dict[str, Any] = {},
     resume_path: Optional[Path] = None,
     epoch_resume: Optional[int] = None,
+    use_gpu: int = -1,
 ):
     verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
-    msg.info(f"Loading config from: {config_path}")
-    with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            validate=True,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    if not output_dir.exists():
-        output_dir.mkdir()
-        msg.good(f"Created output directory: {output_dir}")
-
-    use_gpu = config["training"]["use_gpu"]
     if use_gpu >= 0:
         msg.info("Using GPU")
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+    msg.info(f"Loading config from: {config_path}")
+    config = Config().from_disk(config_path)
+    with show_validation_error():
+        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
+    # TODO: validate that [pretraining] block exists
+    if not output_dir.exists():
+        output_dir.mkdir()
+        msg.good(f"Created output directory: {output_dir}")
     seed = config["pretraining"]["seed"]
     if seed is not None:
         fix_random_seed(seed)
     if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
         use_pytorch_for_gpu_memory()
-    nlp_config = config["nlp"]
-    srsly.write_json(output_dir / "config.json", config)
+    config.to_disk(output_dir / "config.cfg")
     msg.good("Saved config file in the output directory")

-    config = util.load_config(config_path, create_objects=True)
-    nlp = util.load_model_from_config(nlp_config)
     pretrain_config = config["pretraining"]

     if texts_loc != "-":  # reading from a file
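pretrain() above now writes the config back out as config.cfg with Config.to_disk instead of dumping JSON with srsly.write_json, so the saved file can be reloaded directly. A short sketch of that round trip with a throwaway config (the [pretraining] values here are arbitrary):

    import tempfile
    from pathlib import Path
    from thinc.api import Config

    cfg = Config().from_str("[pretraining]\nmax_epochs = 1000\nseed = 0\n")
    with tempfile.TemporaryDirectory() as output_dir:
        path = Path(output_dir) / "config.cfg"
        cfg.to_disk(path)                      # replaces srsly.write_json(...)
        reloaded = Config().from_disk(path)
        print(reloaded["pretraining"]["max_epochs"])  # 1000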
@@ -25,7 +25,7 @@ def profile_cli(
     # fmt: on
 ):
     """
-    Profile a spaCy pipeline, to find out which functions take the most time.
+    Profile which functions take the most time in a spaCy pipeline.
     Input should be formatted as one JSON object per line with a key "text".
     It can either be provided as a JSONL file, or be read from sys.sytdin.
     If no input file is specified, the IMDB dataset is loaded via Thinc.
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 from timeit import default_timer as timer
 import srsly
 import tqdm
@@ -7,6 +7,7 @@ from wasabi import msg
 import thinc
 import thinc.schedules
 from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
+from thinc.api import Config, Optimizer
 import random
 import typer

@@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
 from ..gold import Corpus, Example
 from ..lookups import Lookups
+from ..language import Language
 from .. import util
 from ..errors import Errors
-from ..schemas import ConfigSchema


 # Don't remove - required to load the built-in architectures
 from ..ml import models  # noqa: F401


-registry = util.registry
-
-
 @app.command(
     "train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 )
@@ -38,6 +36,8 @@ def train_cli(
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
+    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
     # fmt: on
 ):
     """
@@ -53,9 +53,7 @@ def train_cli(
     referenced in the config.
     """
     util.set_env_log(verbose)
-    verify_cli_args(
-        train_path=train_path, dev_path=dev_path, config_path=config_path,
-    )
+    verify_cli_args(train_path, dev_path, config_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     train(
@@ -63,6 +61,8 @@ def train_cli(
         {"train": train_path, "dev": dev_path},
         output_path=output_path,
         config_overrides=overrides,
+        use_gpu=use_gpu,
+        resume_training=resume,
     )


@@ -72,63 +72,53 @@ def train(
     raw_text: Optional[Path] = None,
     output_path: Optional[Path] = None,
     config_overrides: Dict[str, Any] = {},
+    use_gpu: int = -1,
+    resume_training: bool = False,
 ) -> None:
-    msg.info(f"Loading config from: {config_path}")
-    # Read the config first without creating objects, to get to the original nlp_config
-    with show_validation_error():
-        config = util.load_config(
-            config_path,
-            create_objects=False,
-            schema=ConfigSchema,
-            overrides=config_overrides,
-        )
-    use_gpu = config["training"]["use_gpu"]
     if use_gpu >= 0:
         msg.info(f"Using GPU: {use_gpu}")
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+    msg.info(f"Loading config and nlp from: {config_path}")
+    config = Config().from_disk(config_path)
+    with show_validation_error():
+        nlp, config = util.load_model_from_config(config, overrides=config_overrides)
+    if config["training"]["base_model"]:
+        base_nlp = util.load_model(config["training"]["base_model"])
+        # TODO: do something to check base_nlp against regular nlp described in config?
+        nlp = base_nlp
+    verify_config(nlp)
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
-    if config["training"].get("use_pytorch_for_gpu_memory"):
+    if config["training"]["use_pytorch_for_gpu_memory"]:
         # It feels kind of weird to not have a default for this.
         use_pytorch_for_gpu_memory()
-    nlp_config = config["nlp"]
-    config = util.load_config(
-        config_path,
-        create_objects=True,
-        schema=ConfigSchema,
-        overrides=config_overrides,
-    )
     training = config["training"]
-    msg.info("Creating nlp from config")
-    nlp = util.load_model_from_config(nlp_config)
     optimizer = training["optimizer"]
     limit = training["limit"]
     corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
-    if "textcat" in nlp_config["pipeline"]:
-        verify_textcat_config(nlp, nlp_config)
-    if training.get("resume", False):
+    if resume_training:
         msg.info("Resuming training")
         nlp.resume_training()
     else:
         msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
-        train_examples = list(
-            corpus.train_dataset(
-                nlp,
-                shuffle=False,
-                gold_preproc=training["gold_preproc"],
-                max_length=training["max_length"],
-            )
-        )
+        train_examples = corpus.train_dataset(
+            nlp,
+            shuffle=False,
+            gold_preproc=training["gold_preproc"],
+            max_length=training["max_length"],
+        )
+        train_examples = list(train_examples)
         nlp.begin_training(lambda: train_examples)

-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
+    if tag_map:
+        # Replace tag map with provided mapping
+        nlp.vocab.morphology.load_tag_map(tag_map)
+    if morph_rules:
+        # Load morph rules
+        nlp.vocab.morphology.load_morph_exceptions(morph_rules)

     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
@@ -151,9 +141,8 @@ def train(
         for subpath in tok2vec_path.split("."):
             tok2vec = tok2vec.get(subpath)
         if not tok2vec:
-            msg.fail(
-                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
-            )
+            err = f"Could not locate the tok2vec model at {tok2vec_path}"
+            msg.fail(err, exits=1)
         tok2vec.from_bytes(weights_data)

     msg.info("Loading training corpus")
@@ -169,12 +158,11 @@ def train(
         evaluate,
         dropout=training["dropout"],
         accumulate_gradient=training["accumulate_gradient"],
-        patience=training.get("patience", 0),
-        max_steps=training.get("max_steps", 0),
+        patience=training["patience"],
+        max_steps=training["max_steps"],
         eval_frequency=training["eval_frequency"],
         raw_text=raw_text,
     )

     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
     print_row = setup_printer(training, nlp)

@@ -209,8 +197,10 @@ def train(
     msg.good(f"Saved model to output directory {final_model_path}")


-def create_train_batches(nlp, corpus, cfg):
-    max_epochs = cfg.get("max_epochs", 0)
+def create_train_batches(
+    nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
+):
+    max_epochs = cfg["max_epochs"]
     train_examples = list(
         corpus.train_dataset(
             nlp,
@@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
             max_length=cfg["max_length"],
         )
     )
-
     epoch = 0
-    batch_strategy = cfg.get("batch_by", "sequences")
+    batch_strategy = cfg["batch_by"]
     while True:
         if len(train_examples) == 0:
             raise ValueError(Errors.E988)
@@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
             )
         else:
             batches = util.minibatch(train_examples, size=cfg["batch_size"])
-
         # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
         try:
             first = next(batches)
@@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
         random.shuffle(train_examples)


-def create_evaluation_callback(nlp, optimizer, corpus, cfg):
-    def evaluate():
-        dev_examples = list(
-            corpus.dev_dataset(
-                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
-            )
+def create_evaluation_callback(
+    nlp: Language,
+    optimizer: Optimizer,
+    corpus: Corpus,
+    cfg: Union[Config, Dict[str, Any]],
+) -> Callable[[], Tuple[float, Dict[str, float]]]:
+    def evaluate() -> Tuple[float, Dict[str, float]]:
+        dev_examples = corpus.dev_dataset(
+            nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
         )
+        dev_examples = list(dev_examples)
         n_words = sum(len(ex.predicted) for ex in dev_examples)
-        batch_size = cfg.get("evaluation_batch_size", 128)
+        batch_size = cfg["eval_batch_size"]
         start_time = timer()

         if optimizer.averages:
             with nlp.use_params(optimizer.averages):
                 scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
@@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
         try:
             weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
         except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="score_weights", key=str(e), keys=list(scores.keys())
-                )
-            )
-
+            keys = list(scores.keys())
+            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
+            raise KeyError(err)
         scores["speed"] = wps
         return weighted_score, scores

@@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):


 def train_while_improving(
-    nlp,
-    optimizer,
+    nlp: Language,
+    optimizer: Optimizer,
     train_data,
     evaluate,
     *,
-    dropout,
-    eval_frequency,
-    accumulate_gradient=1,
-    patience=0,
-    max_steps=0,
-    raw_text=None,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    patience: int,
+    max_steps: int,
+    raw_text: List[Dict[str, str]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
     yield subbatch


-def setup_printer(training, nlp):
+def setup_printer(
+    training: Union[Dict[str, Any], Config], nlp: Language
+) -> Callable[[Dict[str, Any]], None]:
     score_cols = training["scores"]
     score_widths = [max(len(col), 6) for col in score_cols]
     loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
@@ -423,11 +412,10 @@ def setup_printer(training, nlp):
     table_header = [col.upper() for col in table_header]
     table_widths = [3, 6] + loss_widths + score_widths + [6]
     table_aligns = ["r" for _ in table_widths]
-
     msg.row(table_header, widths=table_widths)
     msg.row(["-" * width for width in table_widths])

-    def print_row(info):
+    def print_row(info: Dict[str, Any]) -> None:
         try:
             losses = [
                 "{0:.2f}".format(float(info["losses"][pipe_name]))
@@ -463,7 +451,9 @@ def setup_printer(training, nlp):
     return print_row


-def update_meta(training, nlp, info):
+def update_meta(
+    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+) -> None:
     score_cols = training["scores"]
     nlp.meta["performance"] = {}
     for metric in score_cols:
@@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
         nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]


-def load_from_paths(config):
+def load_from_paths(
+    config: Config,
+) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
     # TODO: separate checks from loading
     raw_text = util.ensure_path(config["training"]["raw_text"])
     if raw_text is not None:
@@ -506,7 +498,7 @@ def verify_cli_args(
     dev_path: Path,
     config_path: Path,
     output_path: Optional[Path] = None,
-):
+) -> None:
     # Make sure all files and paths exists if they are needed
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
@@ -528,12 +520,23 @@ def verify_cli_args(
     )


-def verify_textcat_config(nlp, nlp_config):
+def verify_config(nlp: Language) -> None:
+    """Perform additional checks based on the config and loaded nlp object."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for pipe_config in nlp.config["components"].values():
+        # We can't assume that the component name == the factory
+        factory = pipe_config["@factories"]
+        if factory == "textcat":
+            verify_textcat_config(nlp, pipe_config)
+
+
+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
-    if nlp_config["pipeline"]["textcat"].get("positive_label", None):
+    if pipe_config.get("positive_label"):
         textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
-        pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
+        pos_label = pipe_config.get("positive_label")
         if pos_label not in textcat_labels:
             msg.fail(
                 f"The textcat's 'positive_label' config setting '{pos_label}' "
spacy/default_config.cfg (new file, 102 lines)
@@ -0,0 +1,102 @@
[nlp]
lang = null
stop_words = []
lex_attr_getters = {}
pipeline = []

[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"

[nlp.writing_system]
direction = "ltr"
has_case = true
has_letters = true

[components]

# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 5000
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
eval_batch_size = 128
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "sequences"
raw_text = null
tag_map = null
morph_rules = null
base_model = null
vectors = null

[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.001

[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "components.tok2vec.model"

[pretraining.objective]
type = "characters"
n_characters = 4

[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
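Note that the [pretraining] block above reuses values from [training] through references like ${training:seed}. As a minimal illustrative sketch (not part of the diff, assuming the file is saved at the path shown; exact interpolation behaviour depends on the thinc version), such a config can be loaded with thinc's Config class, the same class the language modules further down use:

    from thinc.api import Config

    config = Config().from_disk("spacy/default_config.cfg")
    print(config["training"]["dropout"])  # -> 0.1
    # References such as ${training:seed} in [pretraining] are resolved
    # against the [training] section when the config is interpolated.
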
spacy/errors.py
@@ -124,20 +124,24 @@ class Warnings:
 @add_codes
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
-    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
-            "calls `nlp.create_pipe` with a component name that's not built "
-            "in - for example, when constructing the pipeline from a model's "
-            "meta.json. If you're using a custom component, you can write to "
-            "`Language.factories['{name}']` or remove it from the model meta "
-            "and add it via `nlp.add_pipe` instead.")
+    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
+            "This usually happens when spaCy calls nlp.{method} with custom "
+            "component name that's not registered on the current language class. "
+            "If you're using a custom component, make sure you've added the "
+            "decorator @Language.component (for function components) or "
+            "@Language.factory (for class components).\n\nAvailable "
+            "factories: {opts}")
     E003 = ("Not a valid pipeline component. Expected callable, but "
-            "got {component} (name: '{name}').")
-    E004 = ("If you meant to add a built-in component, use `create_pipe`: "
-            "`nlp.add_pipe(nlp.create_pipe('{component}'))`")
+            "got {component} (name: '{name}'). If you're using a custom "
+            "component factory, double-check that it correctly returns your "
+            "initialized component.")
+    E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
     E005 = ("Pipeline component '{name}' returned None. If you're using a "
             "custom component, maybe you forgot to return the processed Doc?")
-    E006 = ("Invalid constraints. You can only set one of the following: "
-            "before, after, first, last.")
+    E006 = ("Invalid constraints for adding pipeline component. You can only "
+            "set one of the following: before (component name or index), "
+            "after (component name or index), first (True) or last (True). "
+            "Invalid configuration: {args}. Existing components: {opts}")
     E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
     E008 = ("Some current components would be lost when restoring previous "
             "pipeline state. If you added components after calling "
@@ -184,7 +188,7 @@ class Errors:
             "the documentation:\nhttps://spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
+            "nlp.add_pipe('sentencizer'). "
            "Alternatively, add the dependency parser, or set sentence "
            "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
@@ -365,8 +369,6 @@ class Errors:
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
-            "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
     E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
             "to provide a valid JSON object as input with either the `text` "
             "or `tokens` key. For more info, see the docs:\n"
@@ -484,6 +486,62 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E956 = ("Can't find component '{name}' in [components] block in the config. "
+            "Available components: {opts}")
+    E957 = ("Writing directly to Language.factories isn't needed anymore in "
+            "spaCy v3. Instead, you can use the @Language.factory decorator "
+            "to register your custom component factory or @Language.component "
+            "to register a simple stateless function component that just takes "
+            "a Doc and returns it.")
+    E958 = ("Language code defined in config ({bad_lang_code}) does not match "
+            "language code of current Language subclass {lang} ({lang_code})")
+    E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
+    E960 = ("No config data found for component '{name}'. This is likely a bug "
+            "in spaCy.")
+    E961 = ("Found non-serializable Python object in config. Configs should "
+            "only include values that can be serialized to JSON. If you need "
+            "to pass models or other objects to your component, use a reference "
+            "to a registered function or initialize the object in your "
+            "component.\n\n{config}")
+    E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
+            "got: {cfg_type}.")
+    E963 = ("Can't read component info from @Language.{decorator} decorator. "
+            "Maybe you forgot to call it? Make sure you're using "
+            "@Language.{decorator}() instead of @Language.{decorator}.")
+    E964 = ("The pipeline component factory for '{name}' needs to have the "
+            "following named arguments, which are passed in by spaCy:\n- nlp: "
+            "receives the current nlp object and lets you access the vocab\n- "
+            "name: the name of the component instance, can be used to identify "
+            "the component, output losses etc.")
+    E965 = ("It looks like you're using the @Language.component decorator to "
+            "register '{name}' on a class instead of a function component. If "
+            "you need to register a class or function that *returns* a component "
+            "function, use the @Language.factory decorator instead.")
+    E966 = ("nlp.add_pipe now takes the string name of the registered component "
+            "factory, not a callable component. Expected string, but got "
+            "{component} (name: '{name}').\n\n- If you created your component "
+            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
+            "nlp.add_pipe('name') instead.\n\n- If you passed in a component "
+            "like TextCategorizer(): call nlp.add_pipe with the string name "
+            "instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
+            "component: Add the decorator @Language.component (for function "
+            "components) or @Language.factory (for class components / factories) "
+            "to your custom component and assign it a name, e.g. "
+            "@Language.component('your_name'). You can then run "
+            "nlp.add_pipe('your_name') to add it to the pipeline.")
+    E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
+    E968 = ("nlp.replace_pipe now takes the string name of the registered component "
+            "factory, not a callable component. Expected string, but got "
+            "{component}.\n\n- If you created your component with"
+            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
+            "nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
+            "component like TextCategorizer(): call nlp.replace_pipe with the "
+            "string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
+            "- If you're using a custom component: Add the decorator "
+            "@Language.component (for function components) or @Language.factory "
+            "(for class components / factories) to your custom component and "
+            "assign it a name, e.g. @Language.component('your_name'). You can "
+            "then run nlp.replace_pipe('{name}', 'your_name').")
     E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
     E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
@@ -506,10 +564,12 @@ class Errors:
             "into {values}, but found {value}.")
     E983 = ("Invalid key for '{dict}': {key}. Available keys: "
             "{keys}")
-    E985 = ("The pipeline component '{component}' is already available in the base "
-            "model. The settings in the component block in the config file are "
-            "being ignored. If you want to replace this component instead, set "
-            "'replace' to True in the training configuration.")
+    E984 = ("Invalid component config for '{name}': no @factories key "
+            "specifying the registered function used to initialize the "
+            "component. For example, @factories = \"ner\" will use the 'ner' "
+            "factory and all other settings in the block will be passed "
+            "to it as arguments.\n\n{config}")
+    E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
     E986 = ("Could not create any training batches: check your input. "
             "Perhaps discard_oversize should be set to False ?")
     E987 = ("The text of an example training instance is either a Doc or "
@@ -530,9 +590,9 @@ class Errors:
     E992 = ("The function `select_pipes` was called with `enable`={enable} "
             "and `disable`={disable} but that information is conflicting "
             "for the `nlp` pipeline with components {names}.")
-    E993 = ("The config for 'nlp' should include either a key 'name' to "
-            "refer to an existing model by name or path, or a key 'lang' "
-            "to create a new blank model.")
+    E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
+            "the code of the language to initialize it with (for example "
+            "'en' for English).\n\n{config}")
     E996 = ("Could not parse {file}: {msg}")
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "
@@ -540,9 +600,9 @@ class Errors:
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("No pkuseg model available. Provide a pkuseg model when "
-             "initializing the pipeline: "
-             '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
-             'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
+             "initializing the pipeline:\n"
+             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
+             'nlp = Chinese(config=cfg)')


 @add_codes
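The new E957, E965 and E966 messages above all point at the same v3 registration pattern. As a minimal illustrative sketch (not part of this diff; the component name "my_component" and its behaviour are made up for the example):

    import spacy
    from spacy.language import Language

    @Language.component("my_component")
    def my_component(doc):
        # A stateless function component: receives a Doc and must return it.
        return doc

    nlp = spacy.blank("en")
    nlp.add_pipe("my_component")  # added by registered string name, as E966 describes
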
@@ -1,10 +1,9 @@
 import re

 from .conll_ner2docs import n_sents_info
-from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags
-from ...language import Language
 from ...tokens import Doc, Token, Span
+from ...vocab import Vocab
 from wasabi import Printer


@@ -73,7 +72,7 @@ def read_conllx(
     ner_map=None,
 ):
     """ Yield docs, one for each sentence """
-    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
+    vocab = Vocab()  # need vocab to make a minimal Doc
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
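For context, an illustrative example (not from the diff) of why a bare Vocab is enough here: a minimal Doc can be built directly from a Vocab and a list of words.

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()  # no language-specific defaults required
    doc = Doc(vocab, words=["Hello", "world"])
    print([t.text for t in doc])  # ['Hello', 'world']
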
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "af"
+stop_words = {"@language_data": "spacy.af.stop_words"}
+"""


-class AfrikaansDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "af"
-    stop_words = STOP_WORDS
+@registry.language_data("spacy.af.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Afrikaans(Language):
     lang = "af"
-    Defaults = AfrikaansDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Afrikaans"]
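Illustrative usage of the registry pattern above (not part of the diff): once a language-data function is registered, it can be resolved by its string name. This assumes the catalogue-backed get() lookup that spacy.util.registry provides.

    from spacy.util import registry

    # Look up the registered stop-word provider by name and call it.
    af_stop_words = registry.language_data.get("spacy.af.stop_words")()
    print(len(af_stop_words))
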
|
@ -1,31 +1,48 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...util import update_exc, registry
|
||||||
from ...util import update_exc, add_lookups
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "ar"
|
||||||
|
stop_words = {"@language_data": "spacy.ar.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.writing_system]
|
||||||
|
direction = "rtl"
|
||||||
|
has_case = false
|
||||||
|
has_letters = true
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.ar.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.ar.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class ArabicDefaults(Language.Defaults):
|
class ArabicDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "ar"
|
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
|
||||||
|
|
||||||
|
|
||||||
class Arabic(Language):
|
class Arabic(Language):
|
||||||
lang = "ar"
|
lang = "ar"
|
||||||
Defaults = ArabicDefaults
|
Defaults = ArabicDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Arabic"]
|
__all__ = ["Arabic"]
|
||||||
|
|
|
@ -1,17 +1,26 @@
|
||||||
|
from typing import Set
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...util import registry
|
||||||
|
|
||||||
|
|
||||||
class BulgarianDefaults(Language.Defaults):
|
DEFAULT_CONFIG = """
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
[nlp]
|
||||||
lex_attr_getters[LANG] = lambda text: "bg"
|
lang = "bg"
|
||||||
stop_words = STOP_WORDS
|
stop_words = {"@language_data": "spacy.bg.stop_words"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.bg.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class Bulgarian(Language):
|
class Bulgarian(Language):
|
||||||
lang = "bg"
|
lang = "bg"
|
||||||
Defaults = BulgarianDefaults
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Bulgarian"]
|
__all__ = ["Bulgarian"]
|
||||||
|
|
|
@ -1,18 +1,35 @@
|
||||||
|
from typing import Set
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...util import update_exc, registry
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "bn"
|
||||||
|
stop_words = {"@language_data": "spacy.bn.stop_words"}
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.bn.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class BengaliDefaults(Language.Defaults):
|
class BengaliDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "bn"
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
|
||||||
class Bengali(Language):
|
class Bengali(Language):
|
||||||
lang = "bn"
|
lang = "bn"
|
||||||
Defaults = BengaliDefaults
|
Defaults = BengaliDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Bengali"]
|
__all__ = ["Bengali"]
|
||||||
|
|
|
@ -1,31 +1,49 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...util import update_exc, registry
|
||||||
from ...util import update_exc, add_lookups
|
|
||||||
|
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "ca"
|
||||||
|
stop_words = {"@language_data": "spacy.ca.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.ca.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.ca.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class CatalanDefaults(Language.Defaults):
|
class CatalanDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "ca"
|
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
|
||||||
)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
||||||
|
|
||||||
class Catalan(Language):
|
class Catalan(Language):
|
||||||
lang = "ca"
|
lang = "ca"
|
||||||
Defaults = CatalanDefaults
|
Defaults = CatalanDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Catalan"]
|
__all__ = ["Catalan"]
|
||||||
|
|
|
@ -1,17 +1,26 @@
|
||||||
|
from typing import Set
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...util import registry
|
||||||
|
|
||||||
|
|
||||||
class CzechDefaults(Language.Defaults):
|
DEFAULT_CONFIG = """
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
[nlp]
|
||||||
lex_attr_getters[LANG] = lambda text: "cs"
|
lang = "cs"
|
||||||
stop_words = STOP_WORDS
|
stop_words = {"@language_data": "spacy.cs.stop_words"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.cs.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class Czech(Language):
|
class Czech(Language):
|
||||||
lang = "cs"
|
lang = "cs"
|
||||||
Defaults = CzechDefaults
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Czech"]
|
__all__ = ["Czech"]
|
||||||
|
|
|
@ -1,27 +1,50 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...util import update_exc, registry
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "da"
|
||||||
|
stop_words = {"@language_data": "spacy.da.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.da.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.da.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class DanishDefaults(Language.Defaults):
|
class DanishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "da"
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
|
||||||
|
|
||||||
|
|
||||||
class Danish(Language):
|
class Danish(Language):
|
||||||
lang = "da"
|
lang = "da"
|
||||||
Defaults = DanishDefaults
|
Defaults = DanishDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Danish"]
|
__all__ = ["Danish"]
|
||||||
|
|
|
@ -1,23 +1,40 @@
|
||||||
|
from typing import Set
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...util import update_exc, registry
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "de"
|
||||||
|
stop_words = {"@language_data": "spacy.de.stop_words"}
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.de.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class GermanDefaults(Language.Defaults):
|
class GermanDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "de"
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
stop_words = STOP_WORDS
|
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
single_orth_variants = [
|
single_orth_variants = [
|
||||||
{"tags": ["$("], "variants": ["…", "..."]},
|
{"tags": ["$("], "variants": ["…", "..."]},
|
||||||
|
@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
|
||||||
class German(Language):
|
class German(Language):
|
||||||
lang = "de"
|
lang = "de"
|
||||||
Defaults = GermanDefaults
|
Defaults = GermanDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["German"]
|
__all__ = ["German"]
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...lookups import Lookups
|
from ...util import update_exc, registry
|
||||||
from ...attrs import LANG
|
|
||||||
from ...util import update_exc
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "el"
|
||||||
|
stop_words = {"@language_data": "spacy.el.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.GreekLemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
|
||||||
|
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
|
||||||
|
return GreekLemmatizer(data_paths=data_paths)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.el.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.el.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class GreekDefaults(Language.Defaults):
|
class GreekDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "el"
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
|
||||||
if lookups is None:
|
|
||||||
lookups = Lookups()
|
|
||||||
return GreekLemmatizer(lookups)
|
|
||||||
|
|
||||||
|
|
||||||
class Greek(Language):
|
class Greek(Language):
|
||||||
lang = "el"
|
lang = "el"
|
||||||
Defaults = GreekDefaults
|
Defaults = GreekDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Greek"]
|
__all__ = ["Greek"]
|
||||||
|
|
|
@@ -1,3 +1,5 @@
+from typing import Dict, List
+
 from ...lemmatizer import Lemmatizer


@@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
     not applicable for Greek language.
     """

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         string = string.lower()
         forms = []
         if string in index:
|
@ -1,25 +1,50 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .lemmatizer import is_base_form
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...lemmatizer import Lemmatizer
|
||||||
from ...util import update_exc
|
from ...util import update_exc, registry
|
||||||
|
|
||||||
|
|
||||||
def _return_en(_):
|
DEFAULT_CONFIG = """
|
||||||
return "en"
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
stop_words = {"@language_data": "spacy.en.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.EnglishLemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.en.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.en.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
|
||||||
|
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
|
||||||
|
return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[LANG] = _return_en
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
single_orth_variants = [
|
single_orth_variants = [
|
||||||
|
@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
|
||||||
{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
|
{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
|
||||||
]
|
]
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def is_base_form(cls, univ_pos, morphology=None):
|
|
||||||
"""
|
|
||||||
Check whether we're dealing with an uninflected paradigm, so we can
|
|
||||||
avoid lemmatization entirely.
|
|
||||||
|
|
||||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
|
||||||
morphology (dict): The token's morphological features following the
|
|
||||||
Universal Dependencies scheme.
|
|
||||||
"""
|
|
||||||
if morphology is None:
|
|
||||||
morphology = {}
|
|
||||||
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
|
||||||
return True
|
|
||||||
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
|
||||||
return True
|
|
||||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
|
||||||
# morphology
|
|
||||||
elif univ_pos == "verb" and (
|
|
||||||
morphology.get("VerbForm") == "fin"
|
|
||||||
and morphology.get("Tense") == "pres"
|
|
||||||
and morphology.get("Number") is None
|
|
||||||
):
|
|
||||||
return True
|
|
||||||
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
|
||||||
return True
|
|
||||||
elif morphology.get("VerbForm") == "inf":
|
|
||||||
return True
|
|
||||||
elif morphology.get("VerbForm") == "none":
|
|
||||||
return True
|
|
||||||
elif morphology.get("Degree") == "pos":
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
class English(Language):
|
class English(Language):
|
||||||
lang = "en"
|
lang = "en"
|
||||||
Defaults = EnglishDefaults
|
Defaults = EnglishDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["English"]
|
__all__ = ["English"]
|
||||||
|
|
spacy/lang/en/lemmatizer.py (new file, 36 lines)
@@ -0,0 +1,36 @@
from typing import Optional


def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
    """
    Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos (unicode / int): The token's universal part-of-speech tag.
    morphology (dict): The token's morphological features following the
        Universal Dependencies scheme.
    """
    if morphology is None:
        morphology = {}
    if univ_pos == "noun" and morphology.get("Number") == "sing":
        return True
    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
        return True
    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
    # morphology
    elif univ_pos == "verb" and (
        morphology.get("VerbForm") == "fin"
        and morphology.get("Tense") == "pres"
        and morphology.get("Number") is None
    ):
        return True
    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
        return True
    elif morphology.get("VerbForm") == "inf":
        return True
    elif morphology.get("VerbForm") == "none":
        return True
    elif morphology.get("Degree") == "pos":
        return True
    else:
        return False
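A quick illustrative check of the helper above (not part of the diff), using Universal Dependencies feature names as in its docstring:

    from spacy.lang.en.lemmatizer import is_base_form

    print(is_base_form("noun", {"Number": "sing"}))   # True: uninflected singular noun
    print(is_base_form("verb", {"VerbForm": "inf"}))  # True: infinitive
    print(is_base_form("noun", {"Number": "plur"}))   # False: needs lemmatization
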
@@ -1,47 +1,17 @@
 from ...attrs import LIKE_NUM

+# fmt: off
 _num_words = [
-    "zero",
-    "one",
-    "two",
-    "three",
-    "four",
-    "five",
-    "six",
-    "seven",
-    "eight",
-    "nine",
-    "ten",
-    "eleven",
-    "twelve",
-    "thirteen",
-    "fourteen",
-    "fifteen",
-    "sixteen",
-    "seventeen",
-    "eighteen",
-    "nineteen",
-    "twenty",
-    "thirty",
-    "forty",
-    "fifty",
-    "sixty",
-    "seventy",
-    "eighty",
-    "ninety",
-    "hundred",
-    "thousand",
-    "million",
-    "billion",
-    "trillion",
-    "quadrillion",
-    "gajillion",
-    "bazillion",
+    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
+    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
+    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
+    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
 ]
+# fmt: on


-def like_num(text):
+def like_num(text: str) -> bool:
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     text = text.replace(",", "").replace(".", "")
@ -1,33 +1,52 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.config import Config
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...util import update_exc, registry
|
||||||
from ...util import update_exc, add_lookups
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "es"
|
||||||
|
stop_words = {"@language_data": "spacy.es.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.es.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.es.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class SpanishDefaults(Language.Defaults):
|
class SpanishDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "es"
|
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
|
||||||
)
|
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class Spanish(Language):
|
class Spanish(Language):
|
||||||
lang = "es"
|
lang = "es"
|
||||||
Defaults = SpanishDefaults
|
Defaults = SpanishDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Spanish"]
|
__all__ = ["Spanish"]
|
||||||
|
|
|
@ -1,17 +1,26 @@
|
||||||
|
from typing import Set
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...util import registry
|
||||||
|
|
||||||
|
|
||||||
class EstonianDefaults(Language.Defaults):
|
DEFAULT_CONFIG = """
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
[nlp]
|
||||||
lex_attr_getters[LANG] = lambda text: "et"
|
lang = "et"
|
||||||
stop_words = STOP_WORDS
|
stop_words = {"@language_data": "spacy.et.stop_words"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.et.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class Estonian(Language):
|
class Estonian(Language):
|
||||||
lang = "et"
|
lang = "et"
|
||||||
Defaults = EstonianDefaults
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Estonian"]
|
__all__ = ["Estonian"]
|
||||||
|
|
|
@ -1,25 +1,41 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...util import registry
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "eu"
|
||||||
|
stop_words = {"@language_data": "spacy.eu.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.eu.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.eu.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class BasqueDefaults(Language.Defaults):
|
class BasqueDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "eu"
|
|
||||||
|
|
||||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||||
stop_words = STOP_WORDS
|
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
class Basque(Language):
|
class Basque(Language):
|
||||||
lang = "eu"
|
lang = "eu"
|
||||||
Defaults = BasqueDefaults
|
Defaults = BasqueDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Basque"]
|
__all__ = ["Basque"]
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
|
from typing import Set, Dict, Callable, Any
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...util import update_exc, registry
|
||||||
from ...util import update_exc, add_lookups
|
|
||||||
from ..norm_exceptions import BASE_NORMS
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG = """
|
||||||
|
[nlp]
|
||||||
|
lang = "fa"
|
||||||
|
stop_words = {"@language_data": "spacy.fa.stop_words"}
|
||||||
|
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
|
||||||
|
|
||||||
|
[nlp.writing_system]
|
||||||
|
direction = "rtl"
|
||||||
|
has_case = false
|
||||||
|
has_letters = true
|
||||||
|
|
||||||
|
[nlp.lemmatizer]
|
||||||
|
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||||
|
|
||||||
|
[nlp.lemmatizer.data_paths]
|
||||||
|
@language_data = "spacy-lookups-data"
|
||||||
|
lang = ${nlp:lang}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.fa.stop_words")
|
||||||
|
def stop_words() -> Set[str]:
|
||||||
|
return STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
@registry.language_data("spacy.fa.lex_attr_getters")
|
||||||
|
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||||
|
return LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
class PersianDefaults(Language.Defaults):
|
class PersianDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
|
||||||
lex_attr_getters[NORM] = add_lookups(
|
|
||||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
|
||||||
)
|
|
||||||
lex_attr_getters[LANG] = lambda text: "fa"
|
|
||||||
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
class Persian(Language):
|
class Persian(Language):
|
||||||
lang = "fa"
|
lang = "fa"
|
||||||
Defaults = PersianDefaults
|
Defaults = PersianDefaults
|
||||||
|
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Persian"]
|
__all__ = ["Persian"]
|
||||||
|
|
|
@@ -1,31 +1,43 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "fi"
+stop_words = {"@language_data": "spacy.fi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.fi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class FinnishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "fi"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Finnish(Language):
     lang = "fi"
     Defaults = FinnishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Finnish"]
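The `default_config` strings added here are parsed with Thinc's config system via `Config().from_str(...)`. A short sketch of what that looks like at the call site, assuming a Thinc 8.x install; the snippet and the expected values in the comments are illustrative, not taken from the diff.

    from thinc.api import Config  # assumption: thinc 8.x exposes Config here

    snippet = "\n".join([
        "[nlp]",
        'lang = "fi"',
        'stop_words = {"@language_data": "spacy.fi.stop_words"}',
    ])

    config = Config().from_str(snippet)
    print(config["nlp"]["lang"])        # roughly: "fi"
    print(config["nlp"]["stop_words"])  # roughly: the unresolved {"@language_data": ...} dict

Parsing and resolving are separate steps: `from_str` only builds the nested mapping, and registry references are filled in later when the pipeline is constructed.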
@@ -1,44 +1,61 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from .lemmatizer import FrenchLemmatizer
+from .lemmatizer import FrenchLemmatizer, is_base_form
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "fr"
+stop_words = {"@language_data": "spacy.fr.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.FrenchLemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
+def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
+    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
+
+
+@registry.language_data("spacy.fr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.fr.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class FrenchDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "fr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
     syntax_iterators = SYNTAX_ITERATORS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return FrenchLemmatizer(lookups)


 class French(Language):
     lang = "fr"
     Defaults = FrenchDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["French"]
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict
+
 from ...lemmatizer import Lemmatizer
 from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
 from ...symbols import SCONJ, CCONJ
@@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
     the lookup table.
     """

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         if "lemma_rules" not in self.lookups:
             return [lookup_table.get(string, string)]
@@ -52,62 +56,19 @@ class FrenchLemmatizer(Lemmatizer):
         )
         return lemmas

-    def is_base_form(self, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-        """
-        morphology = {} if morphology is None else morphology
-        others = [
-            key
-            for key in morphology
-            if key not in (POS, "Number", "POS", "VerbForm", "Tense")
-        ]
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-            and not others
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif "VerbForm=inf" in morphology:
-            return True
-        elif "VerbForm=none" in morphology:
-            return True
-        elif "Number=sing" in morphology:
-            return True
-        elif "Degree=pos" in morphology:
-            return True
-        else:
-            return False
-
-    def noun(self, string, morphology=None):
-        return self(string, "noun", morphology)
-
-    def verb(self, string, morphology=None):
-        return self(string, "verb", morphology)
-
-    def adj(self, string, morphology=None):
-        return self(string, "adj", morphology)
-
-    def punct(self, string, morphology=None):
-        return self(string, "punct", morphology)
-
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         if orth is not None and orth in lookup_table:
             return lookup_table[orth][0]
         return string

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         string = string.lower()
         forms = []
@@ -133,3 +94,41 @@ class FrenchLemmatizer(Lemmatizer):
         if not forms:
             forms.append(string)
         return list(set(forms))
+
+
+def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
+    """
+    Check whether we're dealing with an uninflected paradigm, so we can
+    avoid lemmatization entirely.
+    """
+    morphology = {} if morphology is None else morphology
+    others = [
+        key
+        for key in morphology
+        if key not in (POS, "Number", "POS", "VerbForm", "Tense")
+    ]
+    if univ_pos == "noun" and morphology.get("Number") == "sing":
+        return True
+    elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
+        return True
+    # This maps 'VBP' to base form -- probably just need 'IS_BASE'
+    # morphology
+    elif univ_pos == "verb" and (
+        morphology.get("VerbForm") == "fin"
+        and morphology.get("Tense") == "pres"
+        and morphology.get("Number") is None
+        and not others
+    ):
+        return True
+    elif univ_pos == "adj" and morphology.get("Degree") == "pos":
+        return True
+    elif "VerbForm=inf" in morphology:
+        return True
+    elif "VerbForm=none" in morphology:
+        return True
+    elif "Number=sing" in morphology:
+        return True
+    elif "Degree=pos" in morphology:
+        return True
+    else:
+        return False
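`is_base_form` is now a module-level helper that the registered French lemmatizer factory passes into the lemmatizer. A quick sanity check of the rules it encodes, assuming this branch of spaCy is installed so the import resolves; the morphology dicts are hand-written examples, not data from the diff.

    from spacy.lang.fr.lemmatizer import is_base_form  # assumes this branch is installed

    assert is_base_form("noun", {"Number": "sing"})        # uninflected singular noun
    assert is_base_form("verb", {"VerbForm": "inf"})       # infinitive
    assert is_base_form("adj", {"Degree": "pos"})          # positive-degree adjective
    assert not is_base_form("noun", {"Number": "plur"})    # plural noun still needs lemmatization
    assert not is_base_form("verb", {"VerbForm": "part"})  # participle is not a base form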
@@ -1,23 +1,33 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "ga"
+stop_words = {"@language_data": "spacy.ga.stop_words"}
+"""
+
+
+@registry.language_data("spacy.ga.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class IrishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ga"
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)


 class Irish(Language):
     lang = "ga"
     Defaults = IrishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Irish"]
@@ -1,15 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS

 from ...language import Language
+from ...util import registry


-class GujaratiDefaults(Language.Defaults):
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "gu"
+stop_words = {"@language_data": "spacy.gu.stop_words"}
+"""
+
+
+@registry.language_data("spacy.gu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Gujarati(Language):
     lang = "gu"
-    Defaults = GujaratiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Gujarati"]
@@ -1,22 +1,37 @@
-from .stop_words import STOP_WORDS
+from typing import Set
+from thinc.api import Config
+
+from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "he"
+stop_words = {"@language_data": "spacy.he.stop_words"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+"""
+
+
+@registry.language_data("spacy.he.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class HebrewDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "he"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


 class Hebrew(Language):
     lang = "he"
     Defaults = HebrewDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Hebrew"]
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class HindiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "hi"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hi"
+stop_words = {"@language_data": "spacy.hi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.hi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.hi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class Hindi(Language):
     lang = "hi"
-    Defaults = HindiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Hindi"]
@@ -1,25 +1,39 @@
-from .stop_words import STOP_WORDS
+from typing import Set
+from thinc.api import Config
+
+from .stop_words import STOP_WORDS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "hr"
+stop_words = {"@language_data": "spacy.hr.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.hr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class CroatianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Croatian(Language):
     lang = "hr"
     Defaults = CroatianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Croatian"]
@@ -1,22 +1,35 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "hu"
+stop_words = {"@language_data": "spacy.hu.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.hu.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class HungarianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hu"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
@@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
 class Hungarian(Language):
     lang = "hu"
     Defaults = HungarianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Hungarian"]
@@ -1,21 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

-from ...attrs import LANG
 from ...language import Language
+from ...util import registry


-class ArmenianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "hy"
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "hy"
+stop_words = {"@language_data": "spacy.hy.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.hy.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.hy.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class Armenian(Language):
     lang = "hy"
-    Defaults = ArmenianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Armenian"]
@@ -1,21 +1,43 @@
+from typing import Set, Dict, Callable, Any
+from thinc.config import Config
+
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "id"
+stop_words = {"@language_data": "spacy.id.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.id.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.id.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class IndonesianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "id"
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
@@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
 class Indonesian(Language):
     lang = "id"
     Defaults = IndonesianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Indonesian"]
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class IcelandicDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "is"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "is"
+stop_words = {"@language_data": "spacy.is.stop_words"}
+"""
+
+
+@registry.language_data("spacy.is.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Icelandic(Language):
     lang = "is"
-    Defaults = IcelandicDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Icelandic"]
@@ -1,20 +1,34 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "it"
+stop_words = {"@language_data": "spacy.it.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.it.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class ItalianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "it"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
@@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
 class Italian(Language):
     lang = "it"
     Defaults = ItalianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Italian"]
@@ -1,21 +1,187 @@
+from typing import Optional, Union, Dict, Any, Set
+from pathlib import Path
 import srsly
-from collections import namedtuple, OrderedDict
+from collections import namedtuple
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
-from ...attrs import LANG
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
 from ...symbols import POS
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
 from ... import util


+DEFAULT_CONFIG = """
+[nlp]
+lang = "ja"
+stop_words = {"@language_data": "spacy.ja.stop_words"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.JapaneseTokenizer.v1"
+split_mode = null
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = false
+has_letters = false
+"""
+
+
+@registry.language_data("spacy.ja.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.tokenizers("spacy.JapaneseTokenizer.v1")
+def create_japanese_tokenizer(split_mode: Optional[str] = None):
+    def japanese_tokenizer_factory(nlp):
+        return JapaneseTokenizer(nlp, split_mode=split_mode)
+
+    return japanese_tokenizer_factory
+
+
+class JapaneseTokenizer(DummyTokenizer):
+    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
+        self.vocab = nlp.vocab
+        self.split_mode = split_mode
+        self.tokenizer = try_sudachi_import(self.split_mode)
+
+    def __call__(self, text: str) -> Doc:
+        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
+        sudachipy_tokens = self.tokenizer.tokenize(text)
+        dtokens = self._get_dtokens(sudachipy_tokens)
+        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
+
+        # create Doc with tag bi-gram based part-of-speech identification rules
+        words, tags, inflections, lemmas, readings, sub_tokens_list = (
+            zip(*dtokens) if dtokens else [[]] * 6
+        )
+        sub_tokens_list = list(sub_tokens_list)
+        doc = Doc(self.vocab, words=words, spaces=spaces)
+        next_pos = None  # for bi-gram rules
+        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
+            token.tag_ = dtoken.tag
+            if next_pos:  # already identified in previous iteration
+                token.pos = next_pos
+                next_pos = None
+            else:
+                token.pos, next_pos = resolve_pos(
+                    token.orth_,
+                    dtoken.tag,
+                    tags[idx + 1] if idx + 1 < len(tags) else None,
+                )
+            # if there's no lemma info (it's an unk) just use the surface
+            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
+        doc.user_data["inflections"] = inflections
+        doc.user_data["reading_forms"] = readings
+        doc.user_data["sub_tokens"] = sub_tokens_list
+        return doc
+
+    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
+        sub_tokens_list = (
+            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
+        )
+        dtokens = [
+            DetailedToken(
+                token.surface(),  # orth
+                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
+                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
+                token.dictionary_form(),  # lemma
+                token.reading_form(),  # user_data['reading_forms']
+                sub_tokens_list[idx]
+                if sub_tokens_list
+                else None,  # user_data['sub_tokens']
+            )
+            for idx, token in enumerate(sudachipy_tokens)
+            if len(token.surface()) > 0
+            # remove empty tokens which can be produced with characters like … that
+        ]
+        # Sudachi normalizes internally and outputs each space char as a token.
+        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
+        return [
+            t
+            for idx, t in enumerate(dtokens)
+            if idx == 0
+            or not t.surface.isspace()
+            or t.tag != "空白"
+            or not dtokens[idx - 1].surface.isspace()
+            or dtokens[idx - 1].tag != "空白"
+        ]
+
+    def _get_sub_tokens(self, sudachipy_tokens):
+        if (
+            self.split_mode is None or self.split_mode == "A"
+        ):  # do nothing for default split mode
+            return None
+
+        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
+        for token in sudachipy_tokens:
+            sub_a = token.split(self.tokenizer.SplitMode.A)
+            if len(sub_a) == 1:  # no sub tokens
+                sub_tokens_list.append(None)
+            elif self.split_mode == "B":
+                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
+            else:  # "C"
+                sub_b = token.split(self.tokenizer.SplitMode.B)
+                if len(sub_a) == len(sub_b):
+                    dtokens = self._get_dtokens(sub_a, False)
+                    sub_tokens_list.append([dtokens, dtokens])
+                else:
+                    sub_tokens_list.append(
+                        [
+                            self._get_dtokens(sub_a, False),
+                            self._get_dtokens(sub_b, False),
+                        ]
+                    )
+        return sub_tokens_list
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"split_mode": self.split_mode}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.split_mode = config.get("split_mode", None)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
+        return self
+
+
+class JapaneseDefaults(Language.Defaults):
+    tag_map = TAG_MAP
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class Japanese(Language):
+    lang = "ja"
+    Defaults = JapaneseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)
+
+
 # Hold the attributes we need with convenient names
 DetailedToken = namedtuple(
     "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
@@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
     return text_dtokens, text_spaces


-class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None, config={}):
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.split_mode = config.get("split_mode", None)
-        self.tokenizer = try_sudachi_import(self.split_mode)
-
-    def __call__(self, text):
-        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
-        sudachipy_tokens = self.tokenizer.tokenize(text)
-        dtokens = self._get_dtokens(sudachipy_tokens)
-        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
-
-        # create Doc with tag bi-gram based part-of-speech identification rules
-        words, tags, inflections, lemmas, readings, sub_tokens_list = (
-            zip(*dtokens) if dtokens else [[]] * 6
-        )
-        sub_tokens_list = list(sub_tokens_list)
-        doc = Doc(self.vocab, words=words, spaces=spaces)
-        next_pos = None  # for bi-gram rules
-        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
-            token.tag_ = dtoken.tag
-            if next_pos:  # already identified in previous iteration
-                token.pos = next_pos
-                next_pos = None
-            else:
-                token.pos, next_pos = resolve_pos(
-                    token.orth_,
-                    dtoken.tag,
-                    tags[idx + 1] if idx + 1 < len(tags) else None,
-                )
-            # if there's no lemma info (it's an unk) just use the surface
-            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
-
-        doc.user_data["inflections"] = inflections
-        doc.user_data["reading_forms"] = readings
-        doc.user_data["sub_tokens"] = sub_tokens_list
-
-        return doc
-
-    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
-        sub_tokens_list = (
-            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
-        )
-        dtokens = [
-            DetailedToken(
-                token.surface(),  # orth
-                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
-                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
-                token.dictionary_form(),  # lemma
-                token.reading_form(),  # user_data['reading_forms']
-                sub_tokens_list[idx]
-                if sub_tokens_list
-                else None,  # user_data['sub_tokens']
-            )
-            for idx, token in enumerate(sudachipy_tokens)
-            if len(token.surface()) > 0
-            # remove empty tokens which can be produced with characters like … that
-        ]
-        # Sudachi normalizes internally and outputs each space char as a token.
-        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
-        return [
-            t
-            for idx, t in enumerate(dtokens)
-            if idx == 0
-            or not t.surface.isspace()
-            or t.tag != "空白"
-            or not dtokens[idx - 1].surface.isspace()
-            or dtokens[idx - 1].tag != "空白"
-        ]
-
-    def _get_sub_tokens(self, sudachipy_tokens):
-        if (
-            self.split_mode is None or self.split_mode == "A"
-        ):  # do nothing for default split mode
-            return None
-
-        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
-        for token in sudachipy_tokens:
-            sub_a = token.split(self.tokenizer.SplitMode.A)
-            if len(sub_a) == 1:  # no sub tokens
-                sub_tokens_list.append(None)
-            elif self.split_mode == "B":
-                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
-            else:  # "C"
-                sub_b = token.split(self.tokenizer.SplitMode.B)
-                if len(sub_a) == len(sub_b):
-                    dtokens = self._get_dtokens(sub_a, False)
-                    sub_tokens_list.append([dtokens, dtokens])
-                else:
-                    sub_tokens_list.append(
-                        [
-                            self._get_dtokens(sub_a, False),
-                            self._get_dtokens(sub_b, False),
-                        ]
-                    )
-        return sub_tokens_list
-
-    def _get_config(self):
-        config = OrderedDict((("split_mode", self.split_mode),))
-        return config
-
-    def _set_config(self, config={}):
-        self.split_mode = config.get("split_mode", None)
-
-    def to_bytes(self, **kwargs):
-        serializers = OrderedDict(
-            (("cfg", lambda: srsly.json_dumps(self._get_config())),)
-        )
-        return util.to_bytes(serializers, [])
-
-    def from_bytes(self, data, **kwargs):
-        deserializers = OrderedDict(
-            (("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
-        )
-        util.from_bytes(data, deserializers, [])
-        self.tokenizer = try_sudachi_import(self.split_mode)
-        return self
-
-    def to_disk(self, path, **kwargs):
-        path = util.ensure_path(path)
-        serializers = OrderedDict(
-            (("cfg", lambda p: srsly.write_json(p, self._get_config())),)
-        )
-        return util.to_disk(path, serializers, [])
-
-    def from_disk(self, path, **kwargs):
-        path = util.ensure_path(path)
-        serializers = OrderedDict(
-            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
-        )
-        util.from_disk(path, serializers, [])
-        self.tokenizer = try_sudachi_import(self.split_mode)
-
-
-class JapaneseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda _text: "ja"
-    stop_words = STOP_WORDS
-    tag_map = TAG_MAP
-    syntax_iterators = SYNTAX_ITERATORS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None, config={}):
-        return JapaneseTokenizer(cls, nlp, config)
-
-
-class Japanese(Language):
-    lang = "ja"
-    Defaults = JapaneseDefaults
-
-    def make_doc(self, text):
-        return self.tokenizer(text)
-
-
 def pickle_japanese(instance):
     return Japanese, tuple()
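The registered tokenizer entry point above returns a factory rather than the tokenizer itself, so config settings such as `split_mode` are bound first and the `nlp` object (and its vocab) is only supplied later, when the pipeline is assembled. A stand-alone sketch of that two-step pattern; the names and the stand-in `nlp` object are illustrative, not spaCy internals.

    from typing import Optional

    class FakeNLP:
        # Stand-in for a Language object; only here to show the call order.
        vocab = "shared-vocab"

    def create_tokenizer(split_mode: Optional[str] = None):
        # Step 1: called while the config is resolved; only settings are known.
        def tokenizer_factory(nlp):
            # Step 2: called once nlp exists; now the vocab can be attached.
            return {"vocab": nlp.vocab, "split_mode": split_mode}
        return tokenizer_factory

    factory = create_tokenizer(split_mode="B")  # from the [nlp.tokenizer] block
    tokenizer = factory(FakeNLP())              # when the pipeline is built
    print(tokenizer)                            # {'vocab': 'shared-vocab', 'split_mode': 'B'}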
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class KannadaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "kn"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "kn"
+stop_words = {"@language_data": "spacy.kn.stop_words"}
+"""
+
+
+@registry.language_data("spacy.kn.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Kannada(Language):
     lang = "kn"
-    Defaults = KannadaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Kannada"]
@@ -1,51 +1,52 @@
+from typing import Set, Optional, Any, Dict
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
-from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry


-def try_mecab_import():
-    try:
-        from natto import MeCab
-
-        return MeCab
-    except ImportError:
-        raise ImportError(
-            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
-        )
-
-
-# fmt: on
-
-
-def check_spaces(text, tokens):
-    prev_end = -1
-    start = 0
-    for token in tokens:
-        idx = text.find(token, start)
-        if prev_end > 0:
-            yield prev_end != idx
-        prev_end = idx + len(token)
-        start = prev_end
-    if start > 0:
-        yield False
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ko"
+stop_words = {"@language_data": "spacy.ko.stop_words"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.KoreanTokenizer.v1"
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = false
+has_letters = false
+"""
+
+
+@registry.language_data("spacy.ko.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.tokenizers("spacy.KoreanTokenizer.v1")
+def create_korean_tokenizer():
+    def korean_tokenizer_factory(nlp):
+        return KoreanTokenizer(nlp)
+
+    return korean_tokenizer_factory


 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None):
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+    def __init__(self, nlp: Optional[Language] = None):
+        self.vocab = nlp.vocab
         MeCab = try_mecab_import()
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

     def __del__(self):
         self.mecab_tokenizer.__del__()

-    def __call__(self, text):
+    def __call__(self, text: str) -> Doc:
         dtokens = list(self.detailed_tokens(text))
         surfaces = [dt["surface"] for dt in dtokens]
         doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
@@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc

-    def detailed_tokens(self, text):
+    def detailed_tokens(self, text: str) -> Dict[str, Any]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
         for node in self.mecab_tokenizer.parse(text, as_nodes=True):
@@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):


 class KoreanDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda _text: "ko"
-    stop_words = STOP_WORDS
     tag_map = TAG_MAP
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        return KoreanTokenizer(cls, nlp)


 class Korean(Language):
     lang = "ko"
     Defaults = KoreanDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)

-    def make_doc(self, text):
-        return self.tokenizer(text)
+
+def try_mecab_import() -> None:
+    try:
+        from natto import MeCab
+
+        return MeCab
+    except ImportError:
+        raise ImportError(
+            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+        )
+
+
+def check_spaces(text, tokens):
+    prev_end = -1
+    start = 0
+    for token in tokens:
+        idx = text.find(token, start)
+        if prev_end > 0:
+            yield prev_end != idx
+        prev_end = idx + len(token)
+        start = prev_end
+    if start > 0:
+        yield False


 def pickle_korean(instance):
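`check_spaces()` recovers the `spaces` flags for the `Doc` by locating each surface form in the original text and checking whether the next token starts immediately after it. A small stand-alone illustration of the same idea; the helper name and sample text are mine, not from the diff, and it returns a list instead of yielding.

    def spaces_after(text, tokens):
        # True for a token that is followed by whitespace in the original text.
        flags = []
        start = 0
        for token in tokens:
            idx = text.index(token, start)
            end = idx + len(token)
            flags.append(end < len(text) and text[end].isspace())
            start = end
        return flags

    print(spaces_after("안녕하세요 세계", ["안녕", "하세요", "세계"]))
    # [False, True, False]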
@@ -1,26 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "lb"
+stop_words = {"@language_data": "spacy.lb.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.lb.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.lb.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class LuxembourgishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "lb"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES


 class Luxembourgish(Language):
     lang = "lb"
     Defaults = LuxembourgishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Luxembourgish"]
@@ -1,3 +1,4 @@
+from typing import Set
 import unicodedata
 import re

@@ -21,21 +22,21 @@ _tlds = set(
 )


-def is_punct(text):
+def is_punct(text: str) -> bool:
     for char in text:
         if not unicodedata.category(char).startswith("P"):
             return False
     return True


-def is_ascii(text):
+def is_ascii(text: str) -> bool:
     for char in text:
         if ord(char) >= 128:
             return False
     return True


-def like_num(text):
+def like_num(text: str) -> bool:
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     # can be overwritten by lang with list of number words
@@ -49,64 +50,31 @@ def like_num(text):
     return False


-def is_bracket(text):
+def is_bracket(text: str) -> bool:
     brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
     return text in brackets


-def is_quote(text):
-    quotes = (
-        '"',
-        "'",
-        "`",
-        "«",
-        "»",
-        "‘",
-        "’",
-        "‚",
-        "‛",
-        "“",
-        "”",
-        "„",
-        "‟",
-        "‹",
-        "›",
-        "❮",
-        "❯",
-        "''",
-        "``",
-    )
+def is_quote(text: str) -> bool:
+    # fmt: off
+    quotes = ('"', "'", "`", "«", "»", "‘", "’", "‚", "‛", "“", "”", "„", "‟", "‹", "›", "❮", "❯", "''", "``")
+    # fmt: on
     return text in quotes


-def is_left_punct(text):
-    left_punct = (
-        "(",
-        "[",
-        "{",
-        "<",
-        '"',
-        "'",
-        "«",
-        "‘",
-        "‚",
-        "‛",
-        "“",
-        "„",
-        "‟",
-        "‹",
-        "❮",
-        "``",
-    )
+def is_left_punct(text: str) -> bool:
+    # fmt: off
+    left_punct = ("(", "[", "{", "<", '"', "'", "«", "‘", "‚", "‛", "“", "„", "‟", "‹", "❮", "``")
+    # fmt: on
     return text in left_punct


-def is_right_punct(text):
+def is_right_punct(text: str) -> bool:
     right_punct = (")", "]", "}", ">", '"', "'", "»", "’", "”", "›", "❯", "''")
     return text in right_punct


-def is_currency(text):
+def is_currency(text: str) -> bool:
     # can be overwritten by lang with list of currency words, e.g. dollar, euro
     for char in text:
         if unicodedata.category(char) != "Sc":
@@ -114,11 +82,11 @@ def is_currency(text):
     return True


-def like_email(text):
+def like_email(text: str) -> bool:
     return bool(_like_email(text))


-def like_url(text):
+def like_url(text: str) -> bool:
     # We're looking for things that function in text like URLs. So, valid URL
     # or not, anything they say http:// is going to be good.
     if text.startswith("http://") or text.startswith("https://"):
@@ -144,7 +112,7 @@ def like_url(text):
     return False


-def word_shape(text):
+def word_shape(text: str) -> str:
     if len(text) >= 100:
         return "LONG"
     shape = []
@@ -171,46 +139,52 @@ def word_shape(text):
     return "".join(shape)


-def lower(string):
+def lower(string: str) -> str:
     return string.lower()


-def prefix(string):
+def prefix(string: str) -> str:
     return string[0]


-def suffix(string):
+def suffix(string: str) -> str:
     return string[-3:]


-def is_alpha(string):
+def is_alpha(string: str) -> bool:
     return string.isalpha()


-def is_digit(string):
+def is_digit(string: str) -> bool:
     return string.isdigit()


-def is_lower(string):
+def is_lower(string: str) -> bool:
     return string.islower()


-def is_space(string):
+def is_space(string: str) -> bool:
     return string.isspace()


-def is_title(string):
+def is_title(string: str) -> bool:
     return string.istitle()


-def is_upper(string):
+def is_upper(string: str) -> bool:
     return string.isupper()


-def is_stop(string, stops=set()):
+def is_stop(string: str, stops: Set[str] = set()) -> bool:
     return string.lower() in stops


+def get_lang(text: str, lang: str = "") -> str:
+    # This function is partially applied so lang code can be passed in
+    # automatically while still allowing pickling
+    return lang


 LEX_ATTRS = {
     attrs.LOWER: lower,
     attrs.NORM: lower,

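The new get_lang helper replaces the per-language `lambda text: "xx"` getters, which could not be pickled. A minimal sketch of the intended usage; the functools.partial call is an assumption about how a caller would bind the code, not something taken from this diff:

import functools

def get_lang(text: str, lang: str = "") -> str:
    # Ignores the text and returns the language code it was bound to.
    return lang

# Binding the code with functools.partial keeps the getter picklable,
# because get_lang is a module-level function rather than a lambda.
get_lang_lb = functools.partial(get_lang, lang="lb")
assert get_lang_lb("Moien!") == "lb"
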
@@ -1,28 +1,35 @@
+from typing import Set
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "lij"
+stop_words = {"@language_data": "spacy.lij.stop_words"}
+"""


+@registry.language_data("spacy.lij.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class LigurianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "lij"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES


 class Ligurian(Language):
     lang = "lij"
     Defaults = LigurianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Ligurian"]

@@ -1,27 +1,41 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


-def _return_lt(_):
-    return "lt"
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lt"
+stop_words = {"@language_data": "spacy.lt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"

+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""


+@registry.language_data("spacy.lt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.lt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class LithuanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = _return_lt
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)

     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     mod_base_exceptions = {
@@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
     }
     del mod_base_exceptions["8)"]
     tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Lithuanian(Language):
     lang = "lt"
     Defaults = LithuanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Lithuanian"]

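The pattern repeated in each language module above has two halves: a DEFAULT_CONFIG string whose values are named references, and small registered functions that return the actual language data. A minimal sketch of how the two halves fit together; the catalogue stand-in and the .get() resolution step are assumptions based on how catalogue-style registries are normally used, not code from this diff:

from thinc.api import Config
import catalogue

# Stand-in for spaCy's registry.language_data, assumed to behave like a
# standard catalogue registry.
language_data = catalogue.create("example", "language_data")

@language_data("example.lt.stop_words")
def stop_words():
    return {"ir", "bet", "kad"}  # toy stop word set for the example

DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "example.lt.stop_words"}
"""

config = Config().from_str(DEFAULT_CONFIG)
# The config only stores a named reference; the registry supplies the function.
ref = config["nlp"]["stop_words"]["@language_data"]
print(language_data.get(ref)())  # -> the toy stop word set
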
@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class LatvianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "lv"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "lv"
+stop_words = {"@language_data": "spacy.lv.stop_words"}
+"""


+@registry.language_data("spacy.lv.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Latvian(Language):
     lang = "lv"
-    Defaults = LatvianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Latvian"]

@@ -1,15 +1,26 @@
+from typing import Set
+from thinc.api import Config

 from .stop_words import STOP_WORDS

 from ...language import Language
+from ...util import registry


-class MalayalamDefaults(Language.Defaults):
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ml"
+stop_words = {"@language_data": "spacy.ml.stop_words"}
+"""


+@registry.language_data("spacy.ml.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Malayalam(Language):
     lang = "ml"
-    Defaults = MalayalamDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Malayalam"]

@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class MarathiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "mr"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "mr"
+stop_words = {"@language_data": "spacy.mr.stop_words"}
+"""


+@registry.language_data("spacy.mr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Marathi(Language):
     lang = "mr"
-    Defaults = MarathiDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Marathi"]

@@ -1,33 +1,47 @@
+from typing import Set
+from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry

+DEFAULT_CONFIG = """
+[nlp]
+lang = "nb"
+stop_words = {"@language_data": "spacy.nb.stop_words"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"

+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""


+@registry.language_data("spacy.nb.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class NorwegianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "nb"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS


 class Norwegian(Language):
     lang = "nb"
     Defaults = NorwegianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Norwegian"]

@@ -1,23 +1,33 @@
-# coding: utf8
-from __future__ import unicode_literals
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class NepaliDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ne"  # Nepali language ISO code
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ne"
+stop_words = {"@language_data": "spacy.ne.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
+"""


+@registry.language_data("spacy.ne.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.ne.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class Nepali(Language):
     lang = "ne"
-    Defaults = NepaliDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Nepali"]

@@ -1,3 +1,6 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
+DEFAULT_CONFIG = """
+[nlp]
+lang = "nl"
+stop_words = {"@language_data": "spacy.nl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.DutchLemmatizer.v1"

+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""


+@registry.language_data("spacy.nl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.nl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


+@registry.lemmatizers("spacy.DutchLemmatizer.v1")
+def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
+    return DutchLemmatizer(data_paths=data_paths)


 class DutchDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "nl"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return DutchLemmatizer(lookups)


 class Dutch(Language):
     lang = "nl"
     Defaults = DutchDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Dutch"]

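Above, the create_lemmatizer classmethod on the defaults is replaced by a factory registered under a name that the config refers to. A rough sketch of how such a factory could be fetched and called on this branch; the .get() lookup, the explicit module import, and the data_paths value are assumptions based on how catalogue-style registries usually work, not code from this diff:

import spacy.lang.nl  # noqa: F401  (importing the module registers the factory)
from spacy.util import registry

# The diff registers the Dutch factory under this name via
# @registry.lemmatizers("spacy.DutchLemmatizer.v1").
factory = registry.lemmatizers.get("spacy.DutchLemmatizer.v1")

# data_paths would normally come from the [nlp.lemmatizer.data_paths] block,
# i.e. the tables shipped in the spacy-lookups-data package.
lemmatizer = factory(data_paths={})  # empty tables, only to show the call shape
print(type(lemmatizer).__name__)     # -> "DutchLemmatizer"
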
@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict, Tuple

 from ...lemmatizer import Lemmatizer
 from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV

@@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
         "num": "num",
     }

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         # Difference 1: self.rules is assumed to be non-None, so no
         # 'is None' check required.
         # String lowercased from the get-go. All lemmatization results in
@@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
     # Overrides parent method so that a lowercased version of the string is
     # used to search the lookup table. This is necessary because our lookup
     # table consists entirely of lowercase keys.
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         lookup_table = self.lookups.get_table("lemma_lookup", {})
         string = string.lower()
         if orth is not None:
@@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):

     # Reimplemented to focus more on application of suffix rules and to return
     # as early as possible.
-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> Tuple[List[str], bool]:
         # returns (forms, is_known: bool)
         oov_forms = []
         for old, new in rules:

@@ -1,43 +1,60 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import add_lookups
-from ...lookups import Lookups
+from ...util import registry
+DEFAULT_CONFIG = """
+[nlp]
+lang = "pl"
+stop_words = {"@language_data": "spacy.pl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.PolishLemmatizer.v1"

+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""


+@registry.language_data("spacy.pl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.pl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


+@registry.lemmatizers("spacy.PolishLemmatizer.v1")
+def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
+    return PolishLemmatizer(data_paths=data_paths)


 class PolishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "pl"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     mod_base_exceptions = {
         exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
     }
     tokenizer_exceptions = mod_base_exceptions
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return PolishLemmatizer(lookups)


 class Polish(Language):
     lang = "pl"
     Defaults = PolishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Polish"]

@@ -1,3 +1,5 @@
+from typing import Optional, List, Dict

 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES

@@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
     # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
     # It utilizes some prefix based improvements for verb and adjectives
     # lemmatization, as well as case-sensitive lemmatization for nouns.
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         if isinstance(univ_pos, int):
             univ_pos = NAMES.get(univ_pos, "X")
         univ_pos = univ_pos.upper()

         lookup_pos = univ_pos.lower()
         if univ_pos == "PROPN":
             lookup_pos = "noun"
         lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})

         if univ_pos == "NOUN":
             return self.lemmatize_noun(string, morphology, lookup_table)

         if univ_pos != "PROPN":
             string = string.lower()

         if univ_pos == "ADJ":
             return self.lemmatize_adj(string, morphology, lookup_table)
         elif univ_pos == "VERB":
             return self.lemmatize_verb(string, morphology, lookup_table)

         return [lookup_table.get(string, string.lower())]

-    def lemmatize_adj(self, string, morphology, lookup_table):
+    def lemmatize_adj(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
         # this method utilizes different procedures for adjectives
         # with 'nie' and 'naj' prefixes
         if string[:3] == "nie":
@@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
                     return [lookup_table[naj_search_string]]
             if search_string in lookup_table:
                 return [lookup_table[search_string]]

         if string[:3] == "naj":
             naj_search_string = string[3:]
             if naj_search_string in lookup_table:
                 return [lookup_table[naj_search_string]]

         return [lookup_table.get(string, string)]

-    def lemmatize_verb(self, string, morphology, lookup_table):
+    def lemmatize_verb(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
         # this method utilizes a different procedure for verbs
         # with 'nie' prefix
         if string[:3] == "nie":
             search_string = string[3:]
             if search_string in lookup_table:
                 return [lookup_table[search_string]]

         return [lookup_table.get(string, string)]

-    def lemmatize_noun(self, string, morphology, lookup_table):
+    def lemmatize_noun(
+        self, string: str, morphology: dict, lookup_table: Dict[str, str]
+    ) -> List[str]:
         # this method is case-sensitive, in order to work
         # for incorrectly tagged proper names
         if string != string.lower():
@@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
             elif string in lookup_table:
                 return [lookup_table[string]]
             return [string.lower()]

         return [lookup_table.get(string, string)]

-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         return string.lower()

-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         raise NotImplementedError

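The adjective and verb paths above share one idea: strip the negation prefix "nie" (and, for adjectives, the superlative prefix "naj") and retry the lookup before falling back to the surface form. A simplified, self-contained illustration of that idea; it is not the PolishLemmatizer itself, and the lookup table here is a plain dict invented for the example:

def lemmatize_with_prefixes(string: str, lookup_table: dict) -> str:
    # Try the negated form first: "nieduży" -> look up "duży".
    if string.startswith("nie") and string[3:] in lookup_table:
        return lookup_table[string[3:]]
    # Then the superlative: "najlepszy" -> look up "lepszy".
    if string.startswith("naj") and string[3:] in lookup_table:
        return lookup_table[string[3:]]
    # Otherwise fall back to the table, or to the string itself.
    return lookup_table.get(string, string)

toy_table = {"duży": "duży", "lepszy": "dobry"}
print(lemmatize_with_prefixes("nieduży", toy_table))    # -> "duży"
print(lemmatize_with_prefixes("najlepszy", toy_table))  # -> "dobry"
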
@@ -1,20 +1,42 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry

+DEFAULT_CONFIG = """
+[nlp]
+lang = "pt"
+stop_words = {"@language_data": "spacy.pt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"

+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""


+@registry.language_data("spacy.pt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.pt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class PortugueseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "pt"
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES

@@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
 class Portuguese(Language):
     lang = "pt"
     Defaults = PortugueseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Portuguese"]

@@ -1,27 +1,40 @@
+from typing import Set
+from thinc.api import Config

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry

 # Lemma data note:
 # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
 # Replaced characters using cedillas with the correct ones (ș and ț)


+DEFAULT_CONFIG = """
+[nlp]
+lang = "ro"
+stop_words = {"@language_data": "spacy.ro.stop_words"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"

+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""


+@registry.language_data("spacy.ro.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class RomanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ro"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
     infixes = TOKENIZER_INFIXES
@@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
 class Romanian(Language):
     lang = "ro"
     Defaults = RomanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Romanian"]

@@ -1,32 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...util import update_exc
+from ...util import update_exc, registry
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ru"
+stop_words = {"@language_data": "spacy.ru.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.RussianLemmatizer.v1"
+"""


+@registry.language_data("spacy.ru.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.ru.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


+@registry.lemmatizers("spacy.RussianLemmatizer.v1")
+def create_russian_lemmatizer() -> RussianLemmatizer:
+    return RussianLemmatizer()


 class RussianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ru"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return RussianLemmatizer(lookups)


 class Russian(Language):
     lang = "ru"
     Defaults = RussianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Russian"]

@@ -1,11 +1,17 @@
+from typing import Optional, Tuple, Dict, List

 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
 from ...lemmatizer import Lemmatizer
+from ...lookups import Lookups


+PUNCT_RULES = {"«": '"', "»": '"'}


 class RussianLemmatizer(Lemmatizer):
     _morph = None

-    def __init__(self, lookups=None):
+    def __init__(self, lookups: Optional[Lookups] = None) -> None:
         super(RussianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
         if RussianLemmatizer._morph is None:
             RussianLemmatizer._morph = MorphAnalyzer()

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         univ_pos = self.normalize_univ_pos(univ_pos)
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]

         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
             # Skip unchangeable pos
             return [string.lower()]

         analyses = self._morph.parse(string)
         filtered_analyses = []
         for analysis in analyses:
@@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
                 analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
             ):
                 filtered_analyses.append(analysis)

         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
             return list(set([analysis.normal_form for analysis in filtered_analyses]))

         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
                 "VerbForm",
                 "Voice",
             ]

         analyses, filtered_analyses = filtered_analyses, []
         for analysis in analyses:
             _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
                     break
             else:
                 filtered_analyses.append(analysis)

         if not len(filtered_analyses):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))

     @staticmethod
-    def normalize_univ_pos(univ_pos):
+    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
         if isinstance(univ_pos, str):
             return univ_pos.upper()

         symbols_to_str = {
             ADJ: "ADJ",
             DET: "DET",
@@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
             return symbols_to_str[univ_pos]
         return None

-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
         return string


-def oc2ud(oc_tag):
+def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
     gram_map = {
         "_POS": {
             "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
         "Voice": {"actv": "Act", "pssv": "Pass"},
         "Abbr": {"Abbr": "Yes"},
     }

     pos = "X"
     morphology = dict()
     unmatched = set()

     grams = oc_tag.replace(" ", ",").split(",")
     for gram in grams:
         match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                     morphology[categ] = gmap[gram]
         if not match:
             unmatched.add(gram)

     while len(unmatched) > 0:
         gram = unmatched.pop()
         if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
             pos = "AUX"
         elif gram == "Pltm":
             morphology["Number"] = "Ptan"

     return pos, morphology


-PUNCT_RULES = {"«": '"', "»": '"'}

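The lookup path above only trusts pymorphy2 when the analysis is unambiguous. A minimal standalone sketch of that behaviour; it requires the pymorphy2 package with its Russian dictionaries and illustrates the pattern rather than the RussianLemmatizer class itself:

from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()

def lookup_lemma(word: str) -> str:
    # Only return the normal form if pymorphy2 gives exactly one analysis;
    # otherwise keep the surface form, as in RussianLemmatizer.lookup().
    analyses = morph.parse(word)
    if len(analyses) == 1:
        return analyses[0].normal_form
    return word

print(lookup_lemma("кошками"))  # typically "кошка" (a single analysis)
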
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class SinhalaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "si"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "si"
+stop_words = {"@language_data": "spacy.si.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
+"""


+@registry.language_data("spacy.si.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.si.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class Sinhala(Language):
     lang = "si"
-    Defaults = SinhalaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Sinhala"]

@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class SlovakDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sk"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sk"
+stop_words = {"@language_data": "spacy.sk.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
+"""


+@registry.language_data("spacy.sk.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.sk.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class Slovak(Language):
     lang = "sk"
-    Defaults = SlovakDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Slovak"]

@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class SlovenianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "sl"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sl"
+stop_words = {"@language_data": "spacy.sl.stop_words"}
+"""


+@registry.language_data("spacy.sl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Slovenian(Language):
     lang = "sl"
-    Defaults = SlovenianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Slovenian"]

@@ -1,17 +1,26 @@
+from typing import Set
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class AlbanianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "sq"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sq"
+stop_words = {"@language_data": "spacy.sq.stop_words"}
+"""


+@registry.language_data("spacy.sq.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


 class Albanian(Language):
     lang = "sq"
-    Defaults = AlbanianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Albanian"]

@@ -1,23 +1,47 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
-from ...util import update_exc
+from ...util import update_exc, registry

+DEFAULT_CONFIG = """
+[nlp]
+lang = "sr"
+stop_words = {"@language_data": "spacy.sr.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}

+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"

+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""


+@registry.language_data("spacy.sr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS


+@registry.language_data("spacy.sr.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class SerbianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sr"
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Serbian(Language):
     lang = "sr"
     Defaults = SerbianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Serbian"]

spacy/lang/sv/__init__.py
@@ -1,35 +1,54 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...util import update_exc, registry
+from .syntax_iterators import SYNTAX_ITERATORS

 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
-from .syntax_iterators import SYNTAX_ITERATORS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "sv"
+stop_words = {"@language_data": "spacy.sv.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.sv.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.sv.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class SwedishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sv"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
-    stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS


 class Swedish(Language):
     lang = "sv"
     Defaults = SwedishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Swedish"]
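Note: the lemmatizer block above reuses the top-level language code through the `${nlp:lang}` reference. The standard-library sketch below demonstrates that section:key interpolation style; thinc's Config, which the diff imports, supports the same reference form (and additionally JSON-parses the values), so treat this only as an illustration of the mechanism, not spaCy's loading code.

# Interpolation demo with configparser's ExtendedInterpolation (stdlib only).
from configparser import ConfigParser, ExtendedInterpolation

CONFIG = """
[nlp]
lang = sv

[nlp.lemmatizer.data_paths]
lang = ${nlp:lang}
"""

parser = ConfigParser(interpolation=ExtendedInterpolation())
parser.read_string(CONFIG)
# The nested section picks up the value defined once under [nlp].
assert parser["nlp.lemmatizer.data_paths"]["lang"] == "sv"
print(parser["nlp.lemmatizer.data_paths"]["lang"])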
spacy/lang/ta/__init__.py
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class TamilDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "ta"
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "ta"
+stop_words = {"@language_data": "spacy.ta.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.ta.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ta.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class Tamil(Language):
     lang = "ta"
-    Defaults = TamilDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Tamil"]
spacy/lang/te/__init__.py
@@ -1,20 +1,33 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


-class TeluguDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "te"
-    stop_words = STOP_WORDS
+DEFAULT_CONFIG = """
+[nlp]
+lang = "te"
+stop_words = {"@language_data": "spacy.te.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.te.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.te.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class Telugu(Language):
     lang = "te"
-    Defaults = TeluguDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Telugu"]
spacy/lang/th/__init__.py
@@ -1,15 +1,44 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

-from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "th"
+stop_words = {"@language_data": "spacy.th.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.ThaiTokenizer.v1"
+"""
+
+
+@registry.language_data("spacy.th.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.th.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.tokenizers("spacy.ThaiTokenizer.v1")
+def create_thai_tokenizer():
+    def thai_tokenizer_factory(nlp):
+        return ThaiTokenizer(nlp)
+
+    return thai_tokenizer_factory


 class ThaiTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None):
+    def __init__(self, nlp: Language) -> None:
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
@@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
                 "The Thai tokenizer requires the PyThaiNLP library: "
                 "https://github.com/PyThaiNLP/pythainlp"
             )
         self.word_tokenize = word_tokenize
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.vocab = nlp.vocab

-    def __call__(self, text):
+    def __call__(self, text: str) -> Doc:
         words = list(self.word_tokenize(text))
         spaces = [False] * len(words)
         return Doc(self.vocab, words=words, spaces=spaces)


-class ThaiDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda _text: "th"
-    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        return ThaiTokenizer(cls, nlp)
-
-
 class Thai(Language):
     lang = "th"
-    Defaults = ThaiDefaults
-
-    def make_doc(self, text):
-        return self.tokenizer(text)
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Thai"]
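Note: "spacy.ThaiTokenizer.v1" registers an outer function that returns an inner factory taking the `nlp` object; the config stores only the string name. The sketch below mirrors that shape with a whitespace tokenizer so it runs without PyThaiNLP. It is a hedged illustration, not spaCy code; the class and function names are hypothetical.

# Factory-closure pattern, assuming spaCy is installed for Doc and the vocab.
from spacy.tokens import Doc


class WhitespaceTokenizer:
    def __init__(self, nlp):
        self.vocab = nlp.vocab

    def __call__(self, text: str) -> Doc:
        words = text.split()
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


def create_whitespace_tokenizer():
    # This outer function is what would be registered under a versioned name;
    # spaCy calls it once and then calls the returned factory with nlp.
    def tokenizer_factory(nlp):
        return WhitespaceTokenizer(nlp)

    return tokenizer_factory

# Example use (hypothetical): nlp.tokenizer = create_whitespace_tokenizer()(nlp)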
spacy/lang/tl/__init__.py
@@ -1,31 +1,47 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


-def _return_tl(_):
-    return "tl"
+DEFAULT_CONFIG = """
+[nlp]
+lang = "tl"
+stop_words = {"@language_data": "spacy.tl.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.tl.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.tl.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS


 class TagalogDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = _return_tl
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Tagalog(Language):
     lang = "tl"
     Defaults = TagalogDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Tagalog"]
spacy/lang/tr/__init__.py
@@ -1,26 +1,40 @@
+from typing import Set
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "tr"
+stop_words = {"@language_data": "spacy.tr.stop_words"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.tr.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
 class TurkishDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "tr"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS


 class Turkish(Language):
     lang = "tr"
     Defaults = TurkishDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Turkish"]
spacy/lang/tt/__init__.py
@@ -1,28 +1,42 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...attrs import LANG
 from ...language import Language
-from ...util import update_exc
+from ...util import update_exc, registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "tt"
+stop_words = {"@language_data": "spacy.tt.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.tt.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.tt.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class TatarDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "tt"
-
-    lex_attr_getters.update(LEX_ATTRS)
-
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     infixes = tuple(TOKENIZER_INFIXES)
-
-    stop_words = STOP_WORDS


 class Tatar(Language):
     lang = "tt"
     Defaults = TatarDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Tatar"]
spacy/lang/uk/__init__.py
@@ -1,36 +1,49 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
-from ...util import update_exc, add_lookups
+from ...util import update_exc, registry
 from ...language import Language
-from ...lookups import Lookups
-from ...attrs import LANG, NORM
 from .lemmatizer import UkrainianLemmatizer


-class UkrainianDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "uk"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = STOP_WORDS
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return UkrainianLemmatizer(lookups)
+DEFAULT_CONFIG = """
+[nlp]
+lang = "uk"
+stop_words = {"@language_data": "spacy.uk.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.UkrainianLemmatizer.v1"
+"""
+
+
+@registry.language_data("spacy.uk.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.uk.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
+def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
+    return UkrainianLemmatizer()
+
+
+class UkrainianDefaults(Language.Defaults):
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)


 class Ukrainian(Language):
     lang = "uk"
     Defaults = UkrainianDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Ukrainian"]
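Note: `@lemmatizers = "spacy.UkrainianLemmatizer.v1"` in the config names a registered factory, so loading stays declarative. The sketch below shows the same versioned-name lookup with the catalogue package that spaCy's registry builds on; the namespace, class, and registered name are illustrative stand-ins, not spaCy's own objects.

# Versioned registry names with catalogue (assumed installed alongside spaCy).
import catalogue

lemmatizers = catalogue.create("example", "lemmatizers")


class NoopLemmatizer:
    def __call__(self, string: str, univ_pos: str) -> list:
        return [string.lower()]


@lemmatizers("example.NoopLemmatizer.v1")
def create_noop_lemmatizer() -> NoopLemmatizer:
    # Mirrors create_ukrainian_lemmatizer(): no arguments, returns an instance.
    return NoopLemmatizer()


# A config value such as @lemmatizers = "example.NoopLemmatizer.v1" resolves by
# looking up the name and calling the factory:
factory = lemmatizers.get("example.NoopLemmatizer.v1")
lemmatizer = factory()
print(lemmatizer("Кіт", "NOUN"))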
spacy/lang/uk/lemmatizer.py
@@ -1,11 +1,17 @@
+from typing import Optional, List, Tuple, Dict
+
 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
+from ...lookups import Lookups
 from ...lemmatizer import Lemmatizer


+PUNCT_RULES = {"«": '"', "»": '"'}
+
+
 class UkrainianLemmatizer(Lemmatizer):
     _morph = None

-    def __init__(self, lookups=None):
+    def __init__(self, lookups: Optional[Lookups] = None) -> None:
         super(UkrainianLemmatizer, self).__init__(lookups)
         try:
             from pymorphy2 import MorphAnalyzer
@@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
                 '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
             )

-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         univ_pos = self.normalize_univ_pos(univ_pos)
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]

         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
             # Skip unchangeable pos
             return [string.lower()]

         analyses = self._morph.parse(string)
         filtered_analyses = []
         for analysis in analyses:
@@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
                 analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
             ):
                 filtered_analyses.append(analysis)

         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
             return list(set([analysis.normal_form for analysis in filtered_analyses]))

         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
             "VerbForm",
             "Voice",
         ]

         analyses, filtered_analyses = filtered_analyses, []
         for analysis in analyses:
             _, analysis_morph = oc2ud(str(analysis.tag))
@@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
                     break
             else:
                 filtered_analyses.append(analysis)

         if not len(filtered_analyses):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))

     @staticmethod
-    def normalize_univ_pos(univ_pos):
+    def normalize_univ_pos(univ_pos: str) -> Optional[str]:
         if isinstance(univ_pos, str):
             return univ_pos.upper()

         symbols_to_str = {
             ADJ: "ADJ",
             DET: "DET",
@@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
             return symbols_to_str[univ_pos]
         return None

-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
             return analyses[0].normal_form
         return string


-def oc2ud(oc_tag):
+def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
     gram_map = {
         "_POS": {
             "ADJF": "ADJ",
@@ -160,11 +161,9 @@ def oc2ud(oc_tag):
         "Voice": {"actv": "Act", "pssv": "Pass"},
         "Abbr": {"Abbr": "Yes"},
     }

     pos = "X"
     morphology = dict()
     unmatched = set()

     grams = oc_tag.replace(" ", ",").split(",")
     for gram in grams:
         match = False
@@ -177,7 +176,6 @@ def oc2ud(oc_tag):
                 morphology[categ] = gmap[gram]
         if not match:
             unmatched.add(gram)

     while len(unmatched) > 0:
         gram = unmatched.pop()
         if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@@ -186,8 +184,4 @@ def oc2ud(oc_tag):
             pos = "AUX"
         elif gram == "Pltm":
             morphology["Number"] = "Ptan"

     return pos, morphology
-
-
-PUNCT_RULES = {"«": '"', "»": '"'}
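Note: the lemmatizer's `lookup()` path above boils down to a single unambiguous pymorphy2 analysis. A minimal sketch of that step, assuming pymorphy2 and the Ukrainian dictionaries (pymorphy2-dicts-uk) are installed; this is an illustration of the underlying library calls, not spaCy code.

# Requires: pip install pymorphy2 pymorphy2-dicts-uk (assumption for this sketch)
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer(lang="uk")


def lookup_lemma(word: str) -> str:
    analyses = morph.parse(word)
    # Like UkrainianLemmatizer.lookup(): only trust an unambiguous analysis.
    if len(analyses) == 1:
        return analyses[0].normal_form
    return word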
spacy/lang/ur/__init__.py
@@ -1,26 +1,53 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "ur"
+stop_words = {"@language_data": "spacy.ur.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
+
+[nlp.writing_system]
+direction = "rtl"
+has_case = false
+has_letters = true
+
+[nlp.lemmatizer]
+@lemmatizers = "spacy.Lemmatizer.v1"
+
+[nlp.lemmatizer.data_paths]
+@language_data = "spacy-lookups-data"
+lang = ${nlp:lang}
+"""
+
+
+@registry.language_data("spacy.ur.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.ur.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class UrduDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "ur"
-
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
     suffixes = TOKENIZER_SUFFIXES
-    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


 class Urdu(Language):
     lang = "ur"
     Defaults = UrduDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Urdu"]
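Note: the writing_system dict that used to live on UrduDefaults now comes out of the `[nlp.writing_system]` block. A quick check of how that block parses with thinc's Config, which the diff already imports; the exact typing rules (true/false becoming booleans, quoted values staying strings) are thinc's behaviour at the time and should be treated as an illustration.

from thinc.api import Config

cfg = Config().from_str("""
[nlp]
lang = "ur"

[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
""")

ws = cfg["nlp"]["writing_system"]
print(ws["direction"], ws["has_case"], ws["has_letters"])  # expected: rtl False True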
spacy/lang/vi/__init__.py
@@ -1,38 +1,62 @@
-from ...attrs import LANG, NORM
-from ..norm_exceptions import BASE_NORMS
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config

 from ...language import Language
 from ...tokens import Doc
 from .stop_words import STOP_WORDS
-from ...util import add_lookups
+from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS


-class VietnameseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "vi"  # for pickling
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    lex_attr_getters.update(LEX_ATTRS)
-    stop_words = STOP_WORDS
-    use_pyvi = True
+DEFAULT_CONFIG = """
+[nlp]
+lang = "vi"
+stop_words = {"@language_data": "spacy.vi.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.VietnameseTokenizer.v1"
+use_pyvi = true
+"""


-class Vietnamese(Language):
-    lang = "vi"
-    Defaults = VietnameseDefaults  # override defaults
-
-    def make_doc(self, text):
-        if self.Defaults.use_pyvi:
+@registry.language_data("spacy.vi.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.vi.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.tokenizers("spacy.VietnameseTokenizer.v1")
+def create_vietnamese_tokenizer(use_pyvi: bool = True,):
+    def vietnamese_tokenizer_factory(nlp):
+        return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
+
+    return vietnamese_tokenizer_factory
+
+
+class VietnameseTokenizer(DummyTokenizer):
+    def __init__(self, nlp: Language, use_pyvi: bool = False):
+        self.vocab = nlp.vocab
+        self.use_pyvi = use_pyvi
+        if self.use_pyvi:
             try:
                 from pyvi import ViTokenizer
+
+                self.ViTokenizer = ViTokenizer
             except ImportError:
                 msg = (
-                    "Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
+                    "Pyvi not installed. Either set use_pyvi = False, "
                     "or install it https://pypi.python.org/pypi/pyvi"
                 )
                 raise ImportError(msg)
-            words, spaces = ViTokenizer.spacy_tokenize(text)
+
+    def __call__(self, text: str) -> Doc:
+        if self.use_pyvi:
+            words, spaces = self.ViTokenizer.spacy_tokenize(text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
             words = []
@@ -44,4 +68,9 @@ class Vietnamese(Language):
             return Doc(self.vocab, words=words, spaces=spaces)


+class Vietnamese(Language):
+    lang = "vi"
+    default_config = Config().from_str(DEFAULT_CONFIG)
+
+
 __all__ = ["Vietnamese"]
spacy/lang/xx/__init__.py
@@ -1,17 +1,17 @@
+from thinc.api import Config
+
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+
+
+DEFAULT_CONFIG = """
+[nlp]
+lang = "xx"
+"""


 class MultiLanguageDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "xx"
-    lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
-    )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = BASE_EXCEPTIONS


 class MultiLanguage(Language):
@@ -21,6 +21,7 @@ class MultiLanguage(Language):

     lang = "xx"
     Defaults = MultiLanguageDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["MultiLanguage"]
spacy/lang/yo/__init__.py
@@ -1,21 +1,39 @@
+from typing import Set, Dict, Callable, Any
+from thinc.api import Config
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...attrs import LANG
+from ...util import registry


+DEFAULT_CONFIG = """
+[nlp]
+lang = "si"
+stop_words = {"@language_data": "spacy.yo.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
+"""
+
+
+@registry.language_data("spacy.yo.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.yo.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
 class YorubaDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "yo"
-    stop_words = STOP_WORDS
     tokenizer_exceptions = BASE_EXCEPTIONS


 class Yoruba(Language):
     lang = "yo"
     Defaults = YorubaDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)


 __all__ = ["Yoruba"]
spacy/lang/zh/__init__.py
@@ -1,13 +1,15 @@
+from typing import Optional, List, Set, Dict, Callable, Any
+from enum import Enum
 import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from collections import OrderedDict
+from thinc.api import Config

-from ...attrs import LANG
 from ...errors import Warnings, Errors
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, registry
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -16,88 +18,103 @@ from ... import util

 _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"

+DEFAULT_CONFIG = """
+[nlp]
+lang = "zh"
+stop_words = {"@language_data": "spacy.zh.stop_words"}
+lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
+
+[nlp.tokenizer]
+@tokenizers = "spacy.ChineseTokenizer.v1"
+segmenter = "char"
+pkuseg_model = null
+pkuseg_user_dict = "default"
+
+[nlp.writing_system]
+direction = "ltr"
+has_case = false
+has_letters = false
+"""

-def try_jieba_import(segmenter):
-    try:
-        import jieba
-
-        if segmenter == "jieba":
-            # segment a short text to have jieba initialize its cache in advance
-            list(jieba.cut("作为", cut_all=False))
-
-        return jieba
-    except ImportError:
-        if segmenter == "jieba":
-            msg = (
-                "Jieba not installed. To use jieba, install it with `pip "
-                " install jieba` or from https://github.com/fxsjy/jieba"
-            )
-            raise ImportError(msg)
-
-
-def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
-    try:
-        import pkuseg
-
-        if pkuseg_model:
-            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
-        elif segmenter == "pkuseg":
-            msg = (
-                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
-                "was specified. Please provide the name of a pretrained model "
-                "or the path to a model with "
-                '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
-                'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
-            )
-            raise ValueError(msg)
-    except ImportError:
-        if segmenter == "pkuseg":
-            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
-            raise ImportError(msg)
-    except FileNotFoundError:
-        if segmenter == "pkuseg":
-            msg = "Unable to load pkuseg model from: " + pkuseg_model
-            raise FileNotFoundError(msg)
+
+class Segmenter(str, Enum):
+    char = "char"
+    jieba = "jieba"
+    pkuseg = "pkuseg"
+
+    @classmethod
+    def values(cls):
+        return list(cls.__members__.keys())
+
+
+@registry.language_data("spacy.zh.stop_words")
+def stop_words() -> Set[str]:
+    return STOP_WORDS
+
+
+@registry.language_data("spacy.zh.lex_attr_getters")
+def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
+    return LEX_ATTRS
+
+
+@registry.tokenizers("spacy.ChineseTokenizer.v1")
+def create_chinese_tokenizer(
+    segmenter: Segmenter = Segmenter.char,
+    pkuseg_model: Optional[str] = None,
+    pkuseg_user_dict: Optional[str] = "default",
+):
+    def chinese_tokenizer_factory(nlp):
+        return ChineseTokenizer(
+            nlp,
+            segmenter=segmenter,
+            pkuseg_model=pkuseg_model,
+            pkuseg_user_dict=pkuseg_user_dict,
+        )
+
+    return chinese_tokenizer_factory


 class ChineseTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None, config={}):
-        self.supported_segmenters = ("char", "jieba", "pkuseg")
-        self.configure_segmenter(config)
-        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        # remove relevant settings from config so they're not also saved in
-        # Language.meta
-        for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
-            if key in config:
-                del config[key]
-        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+    def __init__(
+        self,
+        nlp: Language,
+        segmenter: Segmenter = Segmenter.char,
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: Optional[str] = None,
+    ):
+        self.vocab = nlp.vocab
+        if isinstance(segmenter, Segmenter):  # we might have the Enum here
+            segmenter = segmenter.value
+        self.segmenter = segmenter
+        self.pkuseg_model = pkuseg_model
+        self.pkuseg_user_dict = pkuseg_user_dict
+        self.pkuseg_seg = None
+        self.jieba_seg = None
+        self.configure_segmenter(segmenter)

-    def configure_segmenter(self, config):
-        self.segmenter = "char"
-        if "segmenter" in config:
-            if config["segmenter"] in self.supported_segmenters:
-                self.segmenter = config["segmenter"]
-            else:
-                warn_msg = Warnings.W103.format(
-                    lang="Chinese",
-                    segmenter=config["segmenter"],
-                    supported=", ".join([repr(s) for s in self.supported_segmenters]),
-                    default="'char' (character segmentation)",
-                )
-                warnings.warn(warn_msg)
+    def configure_segmenter(self, segmenter: str):
+        if segmenter not in Segmenter.values():
+            warn_msg = Warnings.W103.format(
+                lang="Chinese",
+                segmenter=segmenter,
+                supported=", ".join(Segmenter.values()),
+                default="'char' (character segmentation)",
+            )
+            warnings.warn(warn_msg)
+            self.segmenter = Segmenter.char
         self.jieba_seg = try_jieba_import(self.segmenter)
         self.pkuseg_seg = try_pkuseg_import(
             self.segmenter,
-            pkuseg_model=config.get("pkuseg_model", None),
-            pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
+            pkuseg_model=self.pkuseg_model,
+            pkuseg_user_dict=self.pkuseg_user_dict,
         )

-    def __call__(self, text):
-        if self.segmenter == "jieba":
+    def __call__(self, text: str) -> Doc:
+        if self.segmenter == Segmenter.jieba:
             words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
             (words, spaces) = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
-        elif self.segmenter == "pkuseg":
+        elif self.segmenter == Segmenter.pkuseg:
             if self.pkuseg_seg is None:
                 raise ValueError(Errors.E1000)
             words = self.pkuseg_seg.cut(text)
@@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
             return Doc(self.vocab, words=words, spaces=spaces)

         # warn if segmenter setting is not the only remaining option "char"
-        if self.segmenter != "char":
+        if self.segmenter != Segmenter.char:
             warn_msg = Warnings.W103.format(
                 lang="Chinese",
                 segmenter=self.segmenter,
-                supported=", ".join([repr(s) for s in self.supported_segmenters]),
+                supported=", ".join(Segmenter.values()),
                 default="'char' (character segmentation)",
             )
             warnings.warn(warn_msg)
@@ -119,33 +136,25 @@ class ChineseTokenizer(DummyTokenizer):
         (words, spaces) = util.get_words_and_spaces(words, text)
         return Doc(self.vocab, words=words, spaces=spaces)

-    def pkuseg_update_user_dict(self, words, reset=False):
-        if self.segmenter == "pkuseg":
+    def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
+        if self.segmenter == Segmenter.pkuseg:
             if reset:
                 try:
                     import pkuseg

                     self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
                 except ImportError:
-                    if self.segmenter == "pkuseg":
-                        msg = (
-                            "pkuseg not installed: unable to reset pkuseg "
-                            "user dict. Please " + _PKUSEG_INSTALL_MSG
-                        )
-                        raise ImportError(msg)
+                    msg = (
+                        "pkuseg not installed: unable to reset pkuseg "
+                        "user dict. Please " + _PKUSEG_INSTALL_MSG
+                    )
+                    raise ImportError(msg)
             for word in words:
                 self.pkuseg_seg.preprocesser.insert(word.strip(), "")
         else:
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)

-    def _get_config(self):
-        config = OrderedDict((("segmenter", self.segmenter),))
-        return config
-
-    def _set_config(self, config={}):
-        self.configure_segmenter(config)
-
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
         pkuseg_weights_b = b""
@@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
                 sorted(list(self.pkuseg_seg.postprocesser.common_words)),
                 sorted(list(self.pkuseg_seg.postprocesser.other_words)),
             )
-        serializers = OrderedDict(
-            (
-                ("cfg", lambda: srsly.json_dumps(self._get_config())),
-                ("pkuseg_features", lambda: pkuseg_features_b),
-                ("pkuseg_weights", lambda: pkuseg_weights_b),
-                (
-                    "pkuseg_processors",
-                    lambda: srsly.msgpack_dumps(pkuseg_processors_data),
-                ),
-            )
-        )
+        serializers = {
+            "pkuseg_features": lambda: pkuseg_features_b,
+            "pkuseg_weights": lambda: pkuseg_weights_b,
+            "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
+        }
         return util.to_bytes(serializers, [])

     def from_bytes(self, data, **kwargs):
@@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
         def deserialize_pkuseg_processors(b):
             pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

-        deserializers = OrderedDict(
-            (
-                ("cfg", lambda b: self._set_config(srsly.json_loads(b))),
-                ("pkuseg_features", deserialize_pkuseg_features),
-                ("pkuseg_weights", deserialize_pkuseg_weights),
-                ("pkuseg_processors", deserialize_pkuseg_processors),
-            )
-        )
+        deserializers = {
+            "pkuseg_features": deserialize_pkuseg_features,
+            "pkuseg_weights": deserialize_pkuseg_weights,
+            "pkuseg_processors": deserialize_pkuseg_processors,
+        }
         util.from_bytes(data, deserializers, [])

         if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
@@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
             )
             srsly.write_msgpack(path, data)

-        serializers = OrderedDict(
-            (
-                ("cfg", lambda p: srsly.write_json(p, self._get_config())),
-                ("pkuseg_model", lambda p: save_pkuseg_model(p)),
-                ("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
-            )
-        )
+        serializers = {
+            "pkuseg_model": lambda p: save_pkuseg_model(p),
+            "pkuseg_processors": lambda p: save_pkuseg_processors(p),
+        }
         return util.to_disk(path, serializers, [])

     def from_disk(self, path, **kwargs):
@@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.segmenter == "pkuseg":
+                if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(
                         "pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
@@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
             try:
                 import pkuseg
             except ImportError:
-                if self.segmenter == "pkuseg":
+                if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(self._pkuseg_install_msg)
-            if self.segmenter == "pkuseg":
+            if self.segmenter == Segmenter.pkuseg:
                 data = srsly.read_msgpack(path)
                 (user_dict, do_process, common_words, other_words) = data
                 self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)

-        serializers = OrderedDict(
-            (
-                ("cfg", lambda p: self._set_config(srsly.read_json(p))),
-                ("pkuseg_model", lambda p: load_pkuseg_model(p)),
-                ("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
-            )
-        )
+        serializers = {
+            "pkuseg_model": lambda p: load_pkuseg_model(p),
+            "pkuseg_processors": lambda p: load_pkuseg_processors(p),
+        }
         util.from_disk(path, serializers, [])


 class ChineseDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "zh"
     tokenizer_exceptions = BASE_EXCEPTIONS
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None, config={}):
-        return ChineseTokenizer(cls, nlp, config=config)


 class Chinese(Language):
     lang = "zh"
-    Defaults = ChineseDefaults  # override defaults
+    Defaults = ChineseDefaults
+    default_config = Config().from_str(DEFAULT_CONFIG)

-    def make_doc(self, text):
-        return self.tokenizer(text)
+
+def try_jieba_import(segmenter: str) -> None:
+    try:
+        import jieba
+
+        if segmenter == Segmenter.jieba:
+            # segment a short text to have jieba initialize its cache in advance
+            list(jieba.cut("作为", cut_all=False))
+
+        return jieba
+    except ImportError:
+        if segmenter == Segmenter.jieba:
+            msg = (
+                "Jieba not installed. To use jieba, install it with `pip "
+                " install jieba` or from https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
+    try:
+        import pkuseg
+
+        if pkuseg_model:
+            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
+        elif segmenter == Segmenter.pkuseg:
+            msg = (
+                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
+                "was specified. Please provide the name of a pretrained model "
+                "or the path to a model with:\n"
+                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
+                "nlp = Chinese.from_config(cfg)"
+            )
+            raise ValueError(msg)
+    except ImportError:
+        if segmenter == Segmenter.pkuseg:
+            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+            raise ImportError(msg)
+    except FileNotFoundError:
+        if segmenter == Segmenter.pkuseg:
+            msg = "Unable to load pkuseg model from: " + pkuseg_model
+            raise FileNotFoundError(msg)


 def _get_pkuseg_trie_data(node, path=""):
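Note: `Segmenter(str, Enum)` works here because subclassing str makes enum members compare equal to the plain strings read from a config, while the enum still defines the closed set of allowed values. A minimal reproduction of that trick:

from enum import Enum


class Segmenter(str, Enum):
    char = "char"
    jieba = "jieba"
    pkuseg = "pkuseg"

    @classmethod
    def values(cls):
        return list(cls.__members__.keys())


# A config-supplied string compares equal to the member, so checks like
# self.segmenter == Segmenter.pkuseg work whether the value is str or enum.
assert Segmenter.jieba == "jieba"
assert "pkuseg" in Segmenter.values()
print(Segmenter.values())  # ['char', 'jieba', 'pkuseg']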
spacy/language.py (1052 changed lines): diff suppressed because it is too large.
@ -1,5 +1,14 @@
|
||||||
|
from typing import Optional, Callable, List, Dict
|
||||||
|
|
||||||
|
from .lookups import Lookups
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from .parts_of_speech import NAMES as UPOS_NAMES
|
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||||
|
from .util import registry, load_language_data, SimpleFrozenDict
|
||||||
|
|
||||||
|
|
||||||
|
@registry.lemmatizers("spacy.Lemmatizer.v1")
|
||||||
|
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
|
||||||
|
return Lemmatizer(data_paths=data_paths)
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer:
|
class Lemmatizer:
|
||||||
|
@ -14,17 +23,27 @@ class Lemmatizer:
|
||||||
def load(cls, *args, **kwargs):
|
def load(cls, *args, **kwargs):
|
||||||
raise NotImplementedError(Errors.E172)
|
raise NotImplementedError(Errors.E172)
|
||||||
|
|
||||||
def __init__(self, lookups, is_base_form=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
lookups: Optional[Lookups] = None,
|
||||||
|
data_paths: dict = SimpleFrozenDict(),
|
||||||
|
is_base_form: Optional[Callable] = None,
|
||||||
|
) -> None:
|
||||||
"""Initialize a Lemmatizer.
|
"""Initialize a Lemmatizer.
|
||||||
|
|
||||||
lookups (Lookups): The lookups object containing the (optional) tables
|
lookups (Lookups): The lookups object containing the (optional) tables
|
||||||
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
|
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
|
||||||
RETURNS (Lemmatizer): The newly constructed object.
|
         RETURNS (Lemmatizer): The newly constructed object.
         """
-        self.lookups = lookups
+        self.lookups = lookups if lookups is not None else Lookups()
+        for name, filename in data_paths.items():
+            data = load_language_data(filename)
+            self.lookups.add_table(name, data)
         self.is_base_form = is_base_form
 
-    def __call__(self, string, univ_pos, morphology=None):
+    def __call__(
+        self, string: str, univ_pos: str, morphology: Optional[dict] = None
+    ) -> List[str]:
         """Lemmatize a string.
 
         string (str): The string to lemmatize, e.g. the token text.
@@ -39,7 +58,6 @@ class Lemmatizer:
         if isinstance(univ_pos, int):
             univ_pos = UPOS_NAMES.get(univ_pos, "X")
         univ_pos = univ_pos.lower()
-
         if univ_pos in ("", "eol", "space"):
             return [string.lower()]
         # See Issue #435 for example of where this logic is requied.
@@ -67,65 +85,31 @@ class Lemmatizer:
         )
         return lemmas
 
-    def is_base_form(self, univ_pos, morphology=None):
-        """
-        Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.
-
-        univ_pos (str / int): The token's universal part-of-speech tag.
-        morphology (dict): The token's morphological features following the
-            Universal Dependencies scheme.
-        """
-        if morphology is None:
-            morphology = {}
-        if univ_pos == "noun" and morphology.get("Number") == "sing":
-            return True
-        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
-            return True
-        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
-        # morphology
-        elif univ_pos == "verb" and (
-            morphology.get("VerbForm") == "fin"
-            and morphology.get("Tense") == "pres"
-            and morphology.get("Number") is None
-        ):
-            return True
-        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
-            return True
-        elif morphology.get("VerbForm") == "inf":
-            return True
-        elif morphology.get("VerbForm") == "none":
-            return True
-        elif morphology.get("Degree") == "pos":
-            return True
-        else:
-            return False
-
-    def noun(self, string, morphology=None):
+    def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "noun", morphology)
 
-    def verb(self, string, morphology=None):
+    def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "verb", morphology)
 
-    def adj(self, string, morphology=None):
+    def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "adj", morphology)
 
-    def det(self, string, morphology=None):
+    def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "det", morphology)
 
-    def pron(self, string, morphology=None):
+    def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "pron", morphology)
 
-    def adp(self, string, morphology=None):
+    def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "adp", morphology)
 
-    def num(self, string, morphology=None):
+    def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "num", morphology)
 
-    def punct(self, string, morphology=None):
+    def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
         return self(string, "punct", morphology)
 
-    def lookup(self, string, orth=None):
+    def lookup(self, string: str, orth: Optional[int] = None) -> str:
         """Look up a lemma in the table, if available. If no lemma is found,
         the original string is returned.
 
@@ -141,7 +125,13 @@ class Lemmatizer:
             return lookup_table[key]
         return string
 
-    def lemmatize(self, string, index, exceptions, rules):
+    def lemmatize(
+        self,
+        string: str,
+        index: Dict[str, List[str]],
+        exceptions: Dict[str, Dict[str, List[str]]],
+        rules: Dict[str, List[List[str]]],
+    ) -> List[str]:
         orig = string
         string = string.lower()
         forms = []
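A rough usage sketch of the Lemmatizer API as annotated above (the "lemma_rules" table content here is illustrative, not a shipped resource; behaviour follows the rule/index/exception lookup shown in this file):

    from spacy.lemmatizer import Lemmatizer
    from spacy.lookups import Lookups

    # Illustrative sketch: a Lookups container holding a single rules table.
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
    lemmatizer = Lemmatizer(lookups=lookups)
    lemmas = lemmatizer("ducks", "noun")  # should yield ["duck"] via the "s" -> "" rule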
@@ -1,15 +1,32 @@
+from typing import Dict, Any, List, Union, Optional
+from pathlib import Path
 import srsly
 from preshed.bloom import BloomFilter
 from collections import OrderedDict
 
 from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path
+from .util import SimpleFrozenDict, ensure_path, registry
 from .strings import get_string_id
 
 
 UNSET = object()
 
 
+@registry.language_data("spacy-lookups-data")
+def get_lookups(lang: str) -> Dict[str, Any]:
+    """Load the data from the spacy-lookups-data package for a given language,
+    if available. Returns an empty dict if there's no data or if the package
+    is not installed.
+
+    lang (str): The language code (corresponds to entry point exposed by
+        the spacy-lookups-data package).
+    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    """
+    if lang in registry.lookups:
+        return registry.lookups.get(lang)
+    return {}
+
+
 class Lookups:
     """Container for large lookup tables and dictionaries, e.g. lemmatization
     data or tokenizer exception lists. Lookups are available via vocab.lookups,
@@ -18,7 +35,7 @@ class Lookups:
     via doc.vocab.lookups.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         """Initialize the Lookups object.
 
         RETURNS (Lookups): The newly created object.
@@ -27,7 +44,7 @@ class Lookups:
         """
         self._tables = {}
 
-    def __contains__(self, name):
+    def __contains__(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name. Delegates to
         Lookups.has_table.
 
@@ -36,16 +53,16 @@ class Lookups:
         """
         return self.has_table(name)
 
-    def __len__(self):
+    def __len__(self) -> int:
         """RETURNS (int): The number of tables in the lookups."""
         return len(self._tables)
 
     @property
-    def tables(self):
-        """RETURNS (list): Names of all tables in the lookups."""
+    def tables(self) -> List[str]:
+        """RETURNS (List[str]): Names of all tables in the lookups."""
         return list(self._tables.keys())
 
-    def add_table(self, name, data=SimpleFrozenDict()):
+    def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
         """Add a new table to the lookups. Raises an error if the table exists.
 
         name (str): Unique name of table.
@@ -60,12 +77,12 @@ class Lookups:
         self._tables[name] = table
         return table
 
-    def get_table(self, name, default=UNSET):
+    def get_table(self, name: str, default: Any = UNSET) -> "Table":
         """Get a table. Raises an error if the table doesn't exist and no
         default value is provided.
 
         name (str): Name of the table.
-        default: Optional default value to return if table doesn't exist.
+        default (Any): Optional default value to return if table doesn't exist.
         RETURNS (Table): The table.
 
         DOCS: https://spacy.io/api/lookups#get_table
@@ -76,7 +93,7 @@ class Lookups:
             return default
         return self._tables[name]
 
-    def remove_table(self, name):
+    def remove_table(self, name: str) -> "Table":
         """Remove a table. Raises an error if the table doesn't exist.
 
         name (str): Name of the table to remove.
@@ -88,7 +105,7 @@ class Lookups:
            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return self._tables.pop(name)
 
-    def has_table(self, name):
+    def has_table(self, name: str) -> bool:
         """Check if the lookups contain a table of a given name.
 
         name (str): Name of the table.
@@ -98,7 +115,7 @@ class Lookups:
         """
         return name in self._tables
 
-    def to_bytes(self, **kwargs):
+    def to_bytes(self, **kwargs) -> bytes:
         """Serialize the lookups to a bytestring.
 
         RETURNS (bytes): The serialized Lookups.
@@ -107,7 +124,7 @@ class Lookups:
         """
         return srsly.msgpack_dumps(self._tables)
 
-    def from_bytes(self, bytes_data, **kwargs):
+    def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
         """Load the lookups from a bytestring.
 
         bytes_data (bytes): The data to load.
@@ -120,7 +137,9 @@ class Lookups:
             self._tables[key] = Table(key, value)
         return self
 
-    def to_disk(self, path, filename="lookups.bin", **kwargs):
+    def to_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> None:
         """Save the lookups to a directory as lookups.bin. Expects a path to a
         directory, which will be created if it doesn't exist.
 
@@ -136,7 +155,9 @@ class Lookups:
             with filepath.open("wb") as file_:
                 file_.write(self.to_bytes())
 
-    def from_disk(self, path, filename="lookups.bin", **kwargs):
+    def from_disk(
+        self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
+    ) -> "Lookups":
         """Load lookups from a directory containing a lookups.bin. Will skip
         loading if the file doesn't exist.
 
@@ -162,7 +183,7 @@ class Table(OrderedDict):
     """
 
     @classmethod
-    def from_dict(cls, data, name=None):
+    def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
         """Initialize a new table from a dict.
 
         data (dict): The dictionary.
@@ -175,7 +196,7 @@ class Table(OrderedDict):
         self.update(data)
         return self
 
-    def __init__(self, name=None, data=None):
+    def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
         """Initialize a new table.
 
         name (str): Optional table name for reference.
@@ -193,7 +214,7 @@ class Table(OrderedDict):
         if data:
             self.update(data)
 
-    def __setitem__(self, key, value):
+    def __setitem__(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.
 
         key (str / int): The key to set.
@@ -203,7 +224,7 @@ class Table(OrderedDict):
         OrderedDict.__setitem__(self, key, value)
         self.bloom.add(key)
 
-    def set(self, key, value):
+    def set(self, key: Union[str, int], value: Any) -> None:
         """Set new key/value pair. String keys will be hashed.
         Same as table[key] = value.
 
@@ -212,7 +233,7 @@ class Table(OrderedDict):
         """
         self[key] = value
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: Union[str, int]) -> Any:
         """Get the value for a given key. String keys will be hashed.
 
         key (str / int): The key to get.
@@ -221,7 +242,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.__getitem__(self, key)
 
-    def get(self, key, default=None):
+    def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
         """Get the value for a given key. String keys will be hashed.
 
         key (str / int): The key to get.
@@ -231,7 +252,7 @@ class Table(OrderedDict):
         key = get_string_id(key)
         return OrderedDict.get(self, key, default)
 
-    def __contains__(self, key):
+    def __contains__(self, key: Union[str, int]) -> bool:
         """Check whether a key is in the table. String keys will be hashed.
 
         key (str / int): The key to check.
@@ -243,7 +264,7 @@ class Table(OrderedDict):
             return False
         return OrderedDict.__contains__(self, key)
 
-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
         """Serialize table to a bytestring.
 
         RETURNS (bytes): The serialized table.
@@ -257,7 +278,7 @@ class Table(OrderedDict):
         }
         return srsly.msgpack_dumps(data)
 
-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "Table":
         """Load a table from a bytestring.
 
         bytes_data (bytes): The data to load.
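Apart from the new registry hook and the type annotations, the Lookups/Table API is unchanged. A small sketch of the container as typed above (table name and entries are illustrative):

    from spacy.lookups import Lookups

    lookups = Lookups()
    table = lookups.add_table("lemma_lookup", {"going": "go"})
    table.set("was", "be")              # same as table["was"] = "be"; string keys are hashed
    assert "lemma_lookup" in lookups    # delegates to has_table()
    assert table.get("going") == "go"
    restored = Lookups().from_bytes(lookups.to_bytes())
    assert restored.get_table("lemma_lookup").get("was") == "be"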
@@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):
 
 
 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(nlp_path, kb_path) -> KnowledgeBase:
-    vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
+def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+    vocab = Vocab().from_disk(vocab_path)
     kb = KnowledgeBase(vocab=vocab)
     kb.load_bulk(kb_path)
     return kb
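The KB loader now takes the vocab directory directly instead of deriving it from a pipeline path. A sketch of resolving the registered asset by name (the paths are placeholders for a serialized Vocab and KnowledgeBase on disk, and spaCy has to be imported so the registration has run):

    import spacy  # importing spaCy should register "spacy.KBFromFile.v1"
    from spacy.util import registry

    load_kb = registry.assets.get("spacy.KBFromFile.v1")
    kb = load_kb(vocab_path="output/vocab", kb_path="output/kb")  # placeholder paths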
@@ -1,30 +1,9 @@
-from thinc.api import (
-    Model,
-    reduce_mean,
-    Linear,
-    list2ragged,
-    Logistic,
-    ParametricAttention,
-)
-from thinc.api import chain, concatenate, clone, Dropout
-from thinc.api import (
-    SparseLinear,
-    Softmax,
-    softmax_activation,
-    Maxout,
-    reduce_sum,
-    Relu,
-    residual,
-    expand_window,
-)
-from thinc.api import (
-    HashEmbed,
-    with_ragged,
-    with_array,
-    with_cpu,
-    uniqued,
-    FeatureExtractor,
-)
+from typing import Optional
+from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
+from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
+from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
+from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import Relu, residual, expand_window, FeatureExtractor
 
 from ..spacy_vectors import SpacyVectors
 from ... import util
@@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")
-def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
+def build_simple_cnn_text_classifier(
+    tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
+) -> Model:
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
     If exclusive_classes=True, a softmax non-linearity is applied, so that the
@@ -90,13 +71,25 @@ def build_text_classifier(
             nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
         )
         prefix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(PREFIX),
+            dropout=dropout,
+            seed=11,
         )
         suffix = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SUFFIX),
+            dropout=dropout,
+            seed=12,
         )
         shape = HashEmbed(
-            nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
+            nO=width // 2,
+            nV=embed_size,
+            column=cols.index(SHAPE),
+            dropout=dropout,
+            seed=13,
         )
 
         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
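Registered architectures like these are usually composed from the config, but they can also be resolved and called directly. A sketch under the signatures shown here (the tok2vec settings are illustrative values borrowed from the default configs elsewhere in this diff, not a recommendation):

    import spacy  # importing spaCy registers the built-in architectures
    from spacy.util import registry

    make_tok2vec = registry.architectures.get("spacy.HashEmbedCNN.v1")
    tok2vec = make_tok2vec(
        pretrained_vectors=None, width=96, depth=4, embed_size=2000,
        window_size=1, maxout_pieces=3, subword_features=True, dropout=None,
    )
    make_textcat = registry.architectures.get("spacy.TextCatCNN.v1")
    model = make_textcat(tok2vec=tok2vec, exclusive_classes=False, nO=None)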
@@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
 
 
 @registry.architectures.register("spacy.Tok2VecTensors.v1")
-def tok2vec_tensors_v1(width):
-    tok2vec = Tok2VecListener("tok2vec", width=width)
+def tok2vec_tensors_v1(width, upstream="*"):
+    tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
 
 
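The listener can now be pointed at a specific upstream Tok2Vec component instead of the wildcard default. A sketch using the registered name (the upstream component name "tok2vec" is an assumption about the pipeline layout):

    from spacy.util import registry

    make_listener = registry.architectures.get("spacy.Tok2VecTensors.v1")
    listener = make_listener(width=96, upstream="tok2vec")  # upstream="*" matches any Tok2Vec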
@@ -1,30 +1,37 @@
+from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
 from wasabi import Printer
 import warnings
 
 from .tokens import Doc, Token, Span
 from .errors import Errors, Warnings
+from .util import dot_to_dict
+
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401
 
 
-def analyze_pipes(pipeline, name, pipe, index, warn=True):
+def analyze_pipes(
+    nlp: "Language", name: str, index: int, warn: bool = True
+) -> List[str]:
     """Analyze a pipeline component with respect to its position in the current
     pipeline and the other components. Will check whether requirements are
     fulfilled (e.g. if previous components assign the attributes).
 
-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     name (str): The name of the pipeline component to analyze.
-    pipe (callable): The pipeline component function to analyze.
     index (int): The index of the component in the pipeline.
     warn (bool): Show user warning if problem is found.
-    RETURNS (list): The problems found for the given pipeline component.
+    RETURNS (List[str]): The problems found for the given pipeline component.
     """
-    assert pipeline[index][0] == name
-    prev_pipes = pipeline[:index]
-    pipe_requires = getattr(pipe, "requires", [])
-    requires = {annot: False for annot in pipe_requires}
+    assert nlp.pipeline[index][0] == name
+    prev_pipes = nlp.pipeline[:index]
+    meta = nlp.get_pipe_meta(name)
+    requires = {annot: False for annot in meta.requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
-            prev_assigns = getattr(prev_pipe, "assigns", [])
-            for annot in prev_assigns:
+            prev_meta = nlp.get_pipe_meta(prev_name)
+            for annot in prev_meta.assigns:
                 requires[annot] = True
     problems = []
     for annot, fulfilled in requires.items():
@@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     return problems
 
 
-def analyze_all_pipes(pipeline, warn=True):
+def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
     """Analyze all pipes in the pipeline in order.
 
-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    nlp (Language): The current nlp object.
     warn (bool): Show user warning if problem is found.
-    RETURNS (dict): The problems found, keyed by component name.
+    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
     """
     problems = {}
-    for i, (name, pipe) in enumerate(pipeline):
-        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    for i, name in enumerate(nlp.pipe_names):
+        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
     return problems
 
 
-def dot_to_dict(values):
-    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
-    become {"token": {"pos": True, "_": {"xyz": True }}}.
-
-    values (iterable): The values to convert.
-    RETURNS (dict): The converted values.
-    """
-    result = {}
-    for value in values:
-        path = result
-        parts = value.lower().split(".")
-        for i, item in enumerate(parts):
-            is_last = i == len(parts) - 1
-            path = path.setdefault(item, True if is_last else {})
-    return result
-
-
-def validate_attrs(values):
+def validate_attrs(values: Iterable[str]) -> Iterable[str]:
     """Validate component attributes provided to "assigns", "requires" etc.
     Raises error for invalid attributes and formatting. Doesn't check if
     custom extension attributes are registered, since this is something the
     user might want to do themselves later in the component.
 
-    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
-    RETURNS (iterable): The checked attributes.
+    values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (Iterable[str]): The checked attributes.
     """
-    data = dot_to_dict(values)
+    data = dot_to_dict({value: True for value in values})
     objs = {"doc": Doc, "token": Token, "span": Span}
     for obj_key, attrs in data.items():
         if obj_key == "span":
@@ -111,37 +101,40 @@ def validate_attrs(values):
     return values
 
 
-def _get_feature_for_attr(pipeline, attr, feature):
+def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
     assert feature in ["assigns", "requires"]
     result = []
-    for pipe_name, pipe in pipeline:
-        pipe_assigns = getattr(pipe, feature, [])
+    for pipe_name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(pipe_name)
+        pipe_assigns = getattr(meta, feature, [])
         if attr in pipe_assigns:
-            result.append((pipe_name, pipe))
+            result.append(pipe_name)
     return result
 
 
-def get_assigns_for_attr(pipeline, attr):
+def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that assign an attr, e.g. "doc.tensor".
 
-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "assigns")
+    return _get_feature_for_attr(nlp, attr, "assigns")
 
 
-def get_requires_for_attr(pipeline, attr):
+def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
     """Get all pipeline components that require an attr, e.g. "doc.tensor".
 
-    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    pipeline (Language): The current nlp object.
     attr (str): The attribute to check.
-    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    RETURNS (List[str]): Names of components that require the attr.
     """
-    return _get_feature_for_attr(pipeline, attr, "requires")
+    return _get_feature_for_attr(nlp, attr, "requires")
 
 
-def print_summary(nlp, pretty=True, no_print=False):
+def print_summary(
+    nlp: "Language", pretty: bool = True, no_print: bool = False
+) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as
     well as any problems if available.
@@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
     msg = Printer(pretty=pretty, no_print=no_print)
     overview = []
     problems = {}
-    for i, (name, pipe) in enumerate(nlp.pipeline):
-        requires = getattr(pipe, "requires", [])
-        assigns = getattr(pipe, "assigns", [])
-        retok = getattr(pipe, "retokenizes", False)
-        overview.append((i, name, requires, assigns, retok))
-        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    for i, name in enumerate(nlp.pipe_names):
+        meta = nlp.get_pipe_meta(name)
+        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
+        problems[name] = analyze_pipes(nlp, name, i, warn=False)
     msg.divider("Pipeline Overview")
     header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
     msg.table(overview, header=header, divider=True, multiline=True)
@@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
     return {"overview": overview, "problems": problems}
 
 
-def count_pipeline_interdependencies(pipeline):
+def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
     """Count how many subsequent components require an annotation set by each
     component in the pipeline.
 
+    nlp (Language): The current nlp object.
+    RETURNS (List[int]): The interdependency counts.
     """
     pipe_assigns = []
     pipe_requires = []
-    for name, pipe in pipeline:
-        pipe_assigns.append(set(getattr(pipe, "assigns", [])))
-        pipe_requires.append(set(getattr(pipe, "requires", [])))
+    for name in nlp.pipe_names:
+        meta = nlp.get_pipe_meta(name)
+        pipe_assigns.append(set(meta.assigns))
+        pipe_requires.append(set(meta.requires))
     counts = []
     for i, assigns in enumerate(pipe_assigns):
         count = 0
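The removed local dot_to_dict helper now lives in spacy.util and expects a mapping rather than a list, which is why validate_attrs wraps its values first. A quick sketch of that behaviour, reusing the example from the removed docstring:

    from spacy.analysis import validate_attrs
    from spacy.util import dot_to_dict

    nested = dot_to_dict({"token.pos": True, "token._.xyz": True})
    assert nested == {"token": {"pos": True, "_": {"xyz": True}}}
    validate_attrs(["doc.ents", "token.tag"])  # returns the values; raises on malformed attributes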
@@ -1,28 +1,33 @@
-from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
-from .pipes import TextCategorizer, Pipe, Sentencizer
-from .pipes import SentenceRecognizer
-from .simple_ner import SimpleNER
-from .morphologizer import Morphologizer
+from .dep_parser import DependencyParser
+from .entity_linker import EntityLinker
+from .ner import EntityRecognizer
 from .entityruler import EntityRuler
+from .morphologizer import Morphologizer
+from .pipe import Pipe
+from spacy.pipeline.senter import SentenceRecognizer
+from .sentencizer import Sentencizer
+from .simple_ner import SimpleNER
+from .tagger import Tagger
+from .textcat import TextCategorizer
 from .tok2vec import Tok2Vec
 from .hooks import SentenceSegmenter, SimilarityHook
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens
 
 __all__ = [
-    "Tagger",
     "DependencyParser",
-    "EntityRecognizer",
     "EntityLinker",
-    "TextCategorizer",
-    "Tok2Vec",
-    "Pipe",
-    "Morphologizer",
+    "EntityRecognizer",
     "EntityRuler",
-    "Sentencizer",
-    "SentenceSegmenter",
+    "Morphologizer",
+    "Pipe",
     "SentenceRecognizer",
+    "SentenceSegmenter",
+    "Sentencizer",
     "SimilarityHook",
     "SimpleNER",
+    "Tagger",
+    "TextCategorizer",
+    "Tok2Vec",
     "merge_entities",
     "merge_noun_chunks",
     "merge_subtokens",
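The split into per-component modules keeps the public import surface the same, e.g.:

    from spacy.pipeline import DependencyParser, EntityRecognizer, Sentencizer, Tagger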
@@ -1,93 +0,0 @@
-from pathlib import Path
-
-from ... import util
-
-
-def default_nel_config():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_nel():
-    loc = Path(__file__).parent / "entity_linker_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_morphologizer_config():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_morphologizer():
-    loc = Path(__file__).parent / "morphologizer_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_parser_config():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_parser():
-    loc = Path(__file__).parent / "parser_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_ner_config():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_ner():
-    loc = Path(__file__).parent / "ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_senter_config():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_senter():
-    loc = Path(__file__).parent / "senter_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tagger_config():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tagger():
-    loc = Path(__file__).parent / "tagger_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_textcat_config():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_textcat():
-    loc = Path(__file__).parent / "textcat_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_tok2vec_config():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_tok2vec():
-    loc = Path(__file__).parent / "tok2vec_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]
-
-
-def default_simple_ner_config():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=False)
-
-
-def default_simple_ner():
-    loc = Path(__file__).parent / "simple_ner_defaults.cfg"
-    return util.load_config(loc, create_objects=True)["model"]

@@ -1,13 +0,0 @@
-[model]
-@architectures = "spacy.EntityLinker.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 2
-embed_size = 300
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null

@@ -1,14 +0,0 @@
-[model]
-@architectures = "spacy.Tagger.v1"
-
-[model.tok2vec]
-@architectures = "spacy.HashCharEmbedCNN.v1"
-pretrained_vectors = null
-width = 128
-depth = 4
-embed_size = 7000
-window_size = 1
-maxout_pieces = 3
-nM = 64
-nC = 8
-dropout = null

@@ -1,15 +0,0 @@
-[model]
-@architectures = "spacy.MultiTask.v1"
-maxout_pieces = 3
-token_vector_width = 96
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 2
-subword_features = true
-dropout = null

@@ -1,16 +0,0 @@
-[model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 96
-depth = 4
-embed_size = 2000
-window_size = 1
-maxout_pieces = 3
-subword_features = true
-dropout = null

Some files were not shown because too many files have changed in this diff.