mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Merge pull request #12752 from danieldk/maintenance/sync-v4-master-20230626
Sync `master` into `v4`
This commit is contained in:
		
						commit
						b615964be7
					
				
							
								
								
									
										4
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							|  | @ -37,6 +37,10 @@ jobs: | |||
|         run: | | ||||
|           python -m pip install black -c requirements.txt | ||||
|           python -m black spacy --check | ||||
|       - name: isort | ||||
|         run: | | ||||
|           python -m pip install isort -c requirements.txt | ||||
|           python -m isort spacy --check | ||||
|       - name: flake8 | ||||
|         run: | | ||||
|           python -m pip install flake8==5.0.4 | ||||
|  |  | |||
|  | @ -9,3 +9,6 @@ requires = [ | |||
|     "numpy>=1.15.0", | ||||
| ] | ||||
| build-backend = "setuptools.build_meta" | ||||
| 
 | ||||
| [tool.isort] | ||||
| profile = "black" | ||||
|  |  | |||
|  | @ -36,3 +36,4 @@ types-setuptools>=57.0.0 | |||
| types-requests | ||||
| types-setuptools>=57.0.0 | ||||
| black==22.3.0 | ||||
| isort>=5.0,<6.0 | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from typing import Union, Iterable, Dict, Any | ||||
| from pathlib import Path | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, Union | ||||
| 
 | ||||
| # set library-specific custom warning handling before doing anything else | ||||
| from .errors import setup_default_warnings | ||||
|  | @ -8,20 +8,17 @@ from .errors import setup_default_warnings | |||
| setup_default_warnings()  # noqa: E402 | ||||
| 
 | ||||
| # These are imported as part of the API | ||||
| from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401 | ||||
| from thinc.api import Config | ||||
| from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401 | ||||
| 
 | ||||
| from . import pipeline  # noqa: F401 | ||||
| from .cli.info import info  # noqa: F401 | ||||
| from .glossary import explain  # noqa: F401 | ||||
| from .about import __version__  # noqa: F401 | ||||
| from .util import registry, logger  # noqa: F401 | ||||
| 
 | ||||
| from .errors import Errors | ||||
| from .language import Language | ||||
| from .vocab import Vocab | ||||
| from . import util | ||||
| 
 | ||||
| from .about import __version__  # noqa: F401 | ||||
| from .cli.info import info  # noqa: F401 | ||||
| from .errors import Errors | ||||
| from .glossary import explain  # noqa: F401 | ||||
| from .language import Language | ||||
| from .util import logger, registry  # noqa: F401 | ||||
| from .vocab import Vocab | ||||
| 
 | ||||
| if sys.maxunicode == 65535: | ||||
|     raise SystemError(Errors.E130) | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| from . cimport symbols | ||||
| 
 | ||||
| 
 | ||||
| cdef enum attr_id_t: | ||||
|     NULL_ATTR = 0 | ||||
|     IS_ALPHA = symbols.IS_ALPHA | ||||
|  |  | |||
|  | @ -1,35 +1,35 @@ | |||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, setup_cli  # noqa: F401 | ||||
| from .apply import apply  # noqa: F401 | ||||
| from .assemble import assemble_cli  # noqa: F401 | ||||
| 
 | ||||
| # These are the actual functions, NOT the wrapped CLI commands. The CLI commands | ||||
| # are registered automatically and won't have to be imported here. | ||||
| from .benchmark_speed import benchmark_speed_cli  # noqa: F401 | ||||
| from .download import download  # noqa: F401 | ||||
| from .info import info  # noqa: F401 | ||||
| from .package import package  # noqa: F401 | ||||
| from .profile import profile  # noqa: F401 | ||||
| from .train import train_cli  # noqa: F401 | ||||
| from .assemble import assemble_cli  # noqa: F401 | ||||
| from .pretrain import pretrain  # noqa: F401 | ||||
| from .debug_data import debug_data  # noqa: F401 | ||||
| from .debug_config import debug_config  # noqa: F401 | ||||
| from .debug_model import debug_model  # noqa: F401 | ||||
| from .debug_diff import debug_diff  # noqa: F401 | ||||
| from .evaluate import evaluate  # noqa: F401 | ||||
| from .apply import apply  # noqa: F401 | ||||
| from .convert import convert  # noqa: F401 | ||||
| from .init_pipeline import init_pipeline_cli  # noqa: F401 | ||||
| from .init_config import init_config, fill_config  # noqa: F401 | ||||
| from .validate import validate  # noqa: F401 | ||||
| from .project.clone import project_clone  # noqa: F401 | ||||
| from .project.assets import project_assets  # noqa: F401 | ||||
| from .project.run import project_run  # noqa: F401 | ||||
| from .project.dvc import project_update_dvc  # noqa: F401 | ||||
| from .project.push import project_push  # noqa: F401 | ||||
| from .project.pull import project_pull  # noqa: F401 | ||||
| from .project.document import project_document  # noqa: F401 | ||||
| from .debug_config import debug_config  # noqa: F401 | ||||
| from .debug_data import debug_data  # noqa: F401 | ||||
| from .debug_diff import debug_diff  # noqa: F401 | ||||
| from .debug_model import debug_model  # noqa: F401 | ||||
| from .download import download  # noqa: F401 | ||||
| from .evaluate import evaluate  # noqa: F401 | ||||
| from .find_threshold import find_threshold  # noqa: F401 | ||||
| from .info import info  # noqa: F401 | ||||
| from .init_config import fill_config, init_config  # noqa: F401 | ||||
| from .init_pipeline import init_pipeline_cli  # noqa: F401 | ||||
| from .package import package  # noqa: F401 | ||||
| from .pretrain import pretrain  # noqa: F401 | ||||
| from .profile import profile  # noqa: F401 | ||||
| from .project.assets import project_assets  # noqa: F401 | ||||
| from .project.clone import project_clone  # noqa: F401 | ||||
| from .project.document import project_document  # noqa: F401 | ||||
| from .project.dvc import project_update_dvc  # noqa: F401 | ||||
| from .project.pull import project_pull  # noqa: F401 | ||||
| from .project.push import project_push  # noqa: F401 | ||||
| from .project.run import project_run  # noqa: F401 | ||||
| from .train import train_cli  # noqa: F401 | ||||
| from .validate import validate  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
| @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) | ||||
|  |  | |||
|  | @ -1,26 +1,45 @@ | |||
| from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal | ||||
| from typing import TYPE_CHECKING, overload | ||||
| import sys | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from wasabi import msg, Printer | ||||
| import srsly | ||||
| import hashlib | ||||
| import os | ||||
| import shutil | ||||
| import sys | ||||
| from configparser import InterpolationError | ||||
| from contextlib import contextmanager | ||||
| from pathlib import Path | ||||
| from typing import ( | ||||
|     TYPE_CHECKING, | ||||
|     Any, | ||||
|     Dict, | ||||
|     Iterable, | ||||
|     List, | ||||
|     Literal, | ||||
|     Optional, | ||||
|     Tuple, | ||||
|     Union, | ||||
|     overload, | ||||
| ) | ||||
| 
 | ||||
| import srsly | ||||
| import typer | ||||
| from click import NoSuchOption | ||||
| from click.parser import split_arg_string | ||||
| from typer.main import get_command | ||||
| from contextlib import contextmanager | ||||
| from thinc.api import Config, ConfigValidationError, require_gpu | ||||
| from thinc.util import gpu_is_available | ||||
| from configparser import InterpolationError | ||||
| import os | ||||
| from typer.main import get_command | ||||
| from wasabi import Printer, msg | ||||
| 
 | ||||
| from ..schemas import ProjectConfigSchema, validate | ||||
| from ..util import import_file, run_command, make_tempdir, registry, logger | ||||
| from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS | ||||
| from ..errors import RENAMED_LANGUAGE_CODES | ||||
| from .. import about | ||||
| from ..errors import RENAMED_LANGUAGE_CODES | ||||
| from ..schemas import ProjectConfigSchema, validate | ||||
| from ..util import ( | ||||
|     ENV_VARS, | ||||
|     SimpleFrozenDict, | ||||
|     import_file, | ||||
|     is_compatible_version, | ||||
|     logger, | ||||
|     make_tempdir, | ||||
|     registry, | ||||
|     run_command, | ||||
| ) | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pathy import FluidPath  # noqa: F401 | ||||
|  |  | |||
|  | @ -1,18 +1,15 @@ | |||
| import tqdm | ||||
| import srsly | ||||
| 
 | ||||
| from itertools import chain | ||||
| from pathlib import Path | ||||
| from typing import Optional, List, Iterable, cast, Union | ||||
| from typing import Iterable, List, Optional, Union, cast | ||||
| 
 | ||||
| import srsly | ||||
| import tqdm | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory | ||||
| 
 | ||||
| from ..tokens import Doc, DocBin | ||||
| from ..vocab import Vocab | ||||
| from ..util import ensure_path, load_model | ||||
| 
 | ||||
| from ..vocab import Vocab | ||||
| from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory | ||||
| 
 | ||||
| path_help = """Location of the documents to predict on. | ||||
| Can be a single file in .spacy format or a .jsonl file. | ||||
|  |  | |||
|  | @ -1,13 +1,20 @@ | |||
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| import logging | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code | ||||
| from .. import util | ||||
| from ..util import get_sourced_components, load_model_from_config | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     app, | ||||
|     import_code, | ||||
|     parse_config_overrides, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
|  |  | |||
|  | @ -1,11 +1,12 @@ | |||
| from typing import Iterable, List, Optional | ||||
| import random | ||||
| from itertools import islice | ||||
| import numpy | ||||
| from pathlib import Path | ||||
| import time | ||||
| from tqdm import tqdm | ||||
| from itertools import islice | ||||
| from pathlib import Path | ||||
| from typing import Iterable, List, Optional | ||||
| 
 | ||||
| import numpy | ||||
| import typer | ||||
| from tqdm import tqdm | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .. import util | ||||
|  |  | |||
|  | @ -1,18 +1,22 @@ | |||
| from typing import Callable, Iterable, Mapping, Optional, Any, Union | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from wasabi import Printer | ||||
| import srsly | ||||
| import itertools | ||||
| import re | ||||
| import sys | ||||
| import itertools | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, Iterable, Mapping, Optional, Union | ||||
| 
 | ||||
| import srsly | ||||
| from wasabi import Printer | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory | ||||
| from ..training import docs_to_json | ||||
| from ..tokens import Doc, DocBin | ||||
| from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs | ||||
| from ..training.converters import conllu_to_docs | ||||
| 
 | ||||
| from ..training import docs_to_json | ||||
| from ..training.converters import ( | ||||
|     conll_ner_to_docs, | ||||
|     conllu_to_docs, | ||||
|     iob_to_docs, | ||||
|     json_to_docs, | ||||
| ) | ||||
| from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory | ||||
| 
 | ||||
| # Converters are matched by file extension except for ner/iob, which are | ||||
| # matched by file extension and content. To add a converter, add a new | ||||
|  |  | |||
|  | @ -1,15 +1,22 @@ | |||
| from typing import Optional, Dict, Any, Union, List | ||||
| from pathlib import Path | ||||
| from wasabi import msg, table | ||||
| from typing import Any, Dict, List, Optional, Union | ||||
| 
 | ||||
| import typer | ||||
| from thinc.api import Config | ||||
| from thinc.config import VARIABLE_RE | ||||
| import typer | ||||
| from wasabi import msg, table | ||||
| 
 | ||||
| from ._util import Arg, Opt, show_validation_error, parse_config_overrides | ||||
| from ._util import import_code, debug_cli | ||||
| from .. import util | ||||
| from ..schemas import ConfigSchemaInit, ConfigSchemaTraining | ||||
| from ..util import registry | ||||
| from .. import util | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     debug_cli, | ||||
|     import_code, | ||||
|     parse_config_overrides, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command( | ||||
|  |  | |||
|  | @ -1,30 +1,49 @@ | |||
| from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union | ||||
| from typing import Literal, cast, overload | ||||
| from pathlib import Path | ||||
| from collections import Counter | ||||
| import sys | ||||
| import srsly | ||||
| from wasabi import Printer, MESSAGES, msg | ||||
| import typer | ||||
| import math | ||||
| import numpy | ||||
| import sys | ||||
| from collections import Counter | ||||
| from pathlib import Path | ||||
| from typing import ( | ||||
|     Any, | ||||
|     Dict, | ||||
|     Iterable, | ||||
|     List, | ||||
|     Literal, | ||||
|     Optional, | ||||
|     Sequence, | ||||
|     Set, | ||||
|     Tuple, | ||||
|     Union, | ||||
|     cast, | ||||
|     overload, | ||||
| ) | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides | ||||
| from ._util import import_code, debug_cli, _format_number | ||||
| from ..training import Example, remove_bilu_prefix | ||||
| from ..training.initialize import get_sourced_components | ||||
| from ..schemas import ConfigSchemaTraining | ||||
| from ..pipeline import TrainablePipe | ||||
| import numpy | ||||
| import srsly | ||||
| import typer | ||||
| from wasabi import MESSAGES, Printer, msg | ||||
| 
 | ||||
| from .. import util | ||||
| from ..language import Language | ||||
| from ..morphology import Morphology | ||||
| from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe | ||||
| from ..pipeline._edit_tree_internals.edit_trees import EditTrees | ||||
| from ..pipeline._parser_internals import nonproj | ||||
| from ..pipeline._parser_internals.nonproj import DELIMITER | ||||
| from ..pipeline import Morphologizer, SpanCategorizer | ||||
| from ..pipeline._edit_tree_internals.edit_trees import EditTrees | ||||
| from ..morphology import Morphology | ||||
| from ..language import Language | ||||
| from ..schemas import ConfigSchemaTraining | ||||
| from ..training import Example, remove_bilu_prefix | ||||
| from ..training.initialize import get_sourced_components | ||||
| from ..util import registry, resolve_dot_names | ||||
| from ..vectors import Mode as VectorsMode | ||||
| from .. import util | ||||
| 
 | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     _format_number, | ||||
|     app, | ||||
|     debug_cli, | ||||
|     import_code, | ||||
|     parse_config_overrides, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| # Minimum number of expected occurrences of NER label in data to train new label | ||||
| NEW_LABEL_THRESHOLD = 50 | ||||
|  | @ -211,7 +230,7 @@ def debug_data( | |||
|     else: | ||||
|         msg.info("No word vectors present in the package") | ||||
| 
 | ||||
|     if "spancat" in factory_names: | ||||
|     if "spancat" in factory_names or "spancat_singlelabel" in factory_names: | ||||
|         model_labels_spancat = _get_labels_from_spancat(nlp) | ||||
|         has_low_data_warning = False | ||||
|         has_no_neg_warning = False | ||||
|  | @ -829,7 +848,7 @@ def _compile_gold( | |||
|                     data["boundary_cross_ents"] += 1 | ||||
|                 elif label == "-": | ||||
|                     data["ner"]["-"] += 1 | ||||
|         if "spancat" in factory_names: | ||||
|         if "spancat" in factory_names or "spancat_singlelabel" in factory_names: | ||||
|             for spans_key in list(eg.reference.spans.keys()): | ||||
|                 # Obtain the span frequency | ||||
|                 if spans_key not in data["spancat"]: | ||||
|  | @ -1027,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]: | |||
|     pipe_names = [ | ||||
|         pipe_name | ||||
|         for pipe_name in nlp.pipe_names | ||||
|         if nlp.get_pipe_meta(pipe_name).factory == "spancat" | ||||
|         if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel") | ||||
|     ] | ||||
|     labels: Dict[str, Set[str]] = {} | ||||
|     for pipe_name in pipe_names: | ||||
|  |  | |||
|  | @ -1,13 +1,13 @@ | |||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import Printer, diff_strings, MarkdownRenderer | ||||
| from pathlib import Path | ||||
| from thinc.api import Config | ||||
| from wasabi import MarkdownRenderer, Printer, diff_strings | ||||
| 
 | ||||
| from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides | ||||
| from ..util import load_config | ||||
| from .init_config import init_config, Optimizations | ||||
| from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error | ||||
| from .init_config import Optimizations, init_config | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command( | ||||
|  |  | |||
|  | @ -1,19 +1,32 @@ | |||
| from typing import Dict, Any, Optional | ||||
| from pathlib import Path | ||||
| import itertools | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Optional | ||||
| 
 | ||||
| import typer | ||||
| from thinc.api import ( | ||||
|     Model, | ||||
|     data_validation, | ||||
|     fix_random_seed, | ||||
|     set_dropout_rate, | ||||
|     set_gpu_allocator, | ||||
| ) | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from spacy.training import Example | ||||
| from spacy.util import resolve_dot_names | ||||
| from wasabi import msg | ||||
| from thinc.api import fix_random_seed, set_dropout_rate | ||||
| from thinc.api import Model, data_validation, set_gpu_allocator | ||||
| import typer | ||||
| 
 | ||||
| from ._util import Arg, Opt, debug_cli, show_validation_error | ||||
| from ._util import parse_config_overrides, string_to_list, setup_gpu | ||||
| from .. import util | ||||
| from ..schemas import ConfigSchemaTraining | ||||
| from ..util import registry | ||||
| from .. import util | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     debug_cli, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
|     string_to_list, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command( | ||||
|  |  | |||
|  | @ -1,14 +1,20 @@ | |||
| from typing import Optional, Sequence | ||||
| import requests | ||||
| import sys | ||||
| from wasabi import msg | ||||
| import typer | ||||
| from typing import Optional, Sequence | ||||
| 
 | ||||
| import requests | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX | ||||
| from .. import about | ||||
| from ..util import is_package, get_minor_version, run_command | ||||
| from ..util import is_prerelease_version, get_installed_models | ||||
| from ..util import get_package_version | ||||
| from ..util import ( | ||||
|     get_installed_models, | ||||
|     get_minor_version, | ||||
|     get_package_version, | ||||
|     is_package, | ||||
|     is_prerelease_version, | ||||
|     run_command, | ||||
| ) | ||||
| from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
|  |  | |||
|  | @ -1,16 +1,16 @@ | |||
| from typing import Optional, List, Dict, Any, Union | ||||
| from wasabi import Printer | ||||
| from pathlib import Path | ||||
| import re | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional, Union | ||||
| 
 | ||||
| import srsly | ||||
| from thinc.api import fix_random_seed | ||||
| from wasabi import Printer | ||||
| 
 | ||||
| from ..training import Corpus | ||||
| from ..tokens import Doc | ||||
| from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli | ||||
| from .. import displacy, util | ||||
| from ..scorer import Scorer | ||||
| from .. import util | ||||
| from .. import displacy | ||||
| from ..tokens import Doc | ||||
| from ..training import Corpus | ||||
| from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu | ||||
| 
 | ||||
| 
 | ||||
| @benchmark_cli.command( | ||||
|  |  | |||
|  | @ -1,17 +1,17 @@ | |||
| import functools | ||||
| import logging | ||||
| import operator | ||||
| from pathlib import Path | ||||
| import logging | ||||
| from typing import Optional, Tuple, Any, Dict, List | ||||
| from typing import Any, Dict, List, Optional, Tuple | ||||
| 
 | ||||
| import numpy | ||||
| import wasabi.tables | ||||
| 
 | ||||
| from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer | ||||
| from ..errors import Errors | ||||
| from ..training import Corpus | ||||
| from ._util import app, Arg, Opt, import_code, setup_gpu | ||||
| from .. import util | ||||
| from ..errors import Errors | ||||
| from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer | ||||
| from ..training import Corpus | ||||
| from ._util import Arg, Opt, app, import_code, setup_gpu | ||||
| 
 | ||||
| _DEFAULTS = { | ||||
|     "n_trials": 11, | ||||
|  |  | |||
|  | @ -1,15 +1,15 @@ | |||
| from typing import Optional, Dict, Any, Union, List | ||||
| import platform | ||||
| import json | ||||
| from pathlib import Path | ||||
| from wasabi import Printer, MarkdownRenderer | ||||
| import srsly | ||||
| import importlib.metadata | ||||
| import json | ||||
| import platform | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional, Union | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, string_to_list | ||||
| from .download import get_model_filename, get_latest_version | ||||
| from .. import util | ||||
| from .. import about | ||||
| import srsly | ||||
| from wasabi import MarkdownRenderer, Printer | ||||
| 
 | ||||
| from .. import about, util | ||||
| from ._util import Arg, Opt, app, string_to_list | ||||
| from .download import get_latest_version, get_model_filename | ||||
| 
 | ||||
| 
 | ||||
| @app.command("info") | ||||
|  |  | |||
|  | @ -1,19 +1,27 @@ | |||
| from typing import Optional, List, Tuple | ||||
| import re | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from wasabi import Printer, diff_strings | ||||
| from thinc.api import Config | ||||
| from typing import List, Optional, Tuple | ||||
| 
 | ||||
| import srsly | ||||
| import re | ||||
| from jinja2 import Template | ||||
| from thinc.api import Config | ||||
| from wasabi import Printer, diff_strings | ||||
| 
 | ||||
| from .. import util | ||||
| from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH | ||||
| from ..schemas import RecommendationSchema | ||||
| from ..util import SimpleFrozenList | ||||
| from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND | ||||
| from ._util import string_to_list, import_code, _handle_renamed_language_codes | ||||
| 
 | ||||
| from ._util import ( | ||||
|     COMMAND, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     _handle_renamed_language_codes, | ||||
|     import_code, | ||||
|     init_cli, | ||||
|     show_validation_error, | ||||
|     string_to_list, | ||||
| ) | ||||
| 
 | ||||
| ROOT = Path(__file__).parent / "templates" | ||||
| TEMPLATE_PATH = ROOT / "quickstart_training.jinja" | ||||
|  |  | |||
|  | @ -1,15 +1,24 @@ | |||
| from typing import Optional | ||||
| import logging | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| from typing import Optional | ||||
| 
 | ||||
| import srsly | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .. import util | ||||
| from ..training.initialize import init_nlp, convert_vectors | ||||
| from ..language import Language | ||||
| from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code, setup_gpu, _handle_renamed_language_codes | ||||
| from ..training.initialize import convert_vectors, init_nlp | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     _handle_renamed_language_codes, | ||||
|     import_code, | ||||
|     init_cli, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @init_cli.command("vectors") | ||||
|  |  | |||
|  | @ -1,18 +1,18 @@ | |||
| from typing import Optional, Union, Any, Dict, List, Tuple, cast | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from wasabi import Printer, MarkdownRenderer, get_raw_input | ||||
| from thinc.api import Config | ||||
| from collections import defaultdict | ||||
| from catalogue import RegistryError | ||||
| import srsly | ||||
| import sys | ||||
| import re | ||||
| import shutil | ||||
| import sys | ||||
| from collections import defaultdict | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional, Tuple, Union, cast | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX | ||||
| from ..schemas import validate, ModelMetaSchema | ||||
| from .. import util | ||||
| from .. import about | ||||
| import srsly | ||||
| from catalogue import RegistryError | ||||
| from thinc.api import Config | ||||
| from wasabi import MarkdownRenderer, Printer, get_raw_input | ||||
| 
 | ||||
| from .. import about, util | ||||
| from ..schemas import ModelMetaSchema, validate | ||||
| from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list | ||||
| 
 | ||||
| 
 | ||||
| @app.command("package") | ||||
|  |  | |||
|  | @ -1,13 +1,21 @@ | |||
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| import re | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code, setup_gpu | ||||
| from ..training.pretrain import pretrain | ||||
| from ..util import load_config | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     app, | ||||
|     import_code, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
|  |  | |||
|  | @ -1,17 +1,18 @@ | |||
| from typing import Optional, Sequence, Union, Iterator | ||||
| import tqdm | ||||
| from pathlib import Path | ||||
| import srsly | ||||
| import cProfile | ||||
| import itertools | ||||
| import pstats | ||||
| import sys | ||||
| import itertools | ||||
| from wasabi import msg, Printer | ||||
| import typer | ||||
| from pathlib import Path | ||||
| from typing import Iterator, Optional, Sequence, Union | ||||
| 
 | ||||
| import srsly | ||||
| import tqdm | ||||
| import typer | ||||
| from wasabi import Printer, msg | ||||
| 
 | ||||
| from ._util import app, debug_cli, Arg, Opt, NAME | ||||
| from ..language import Language | ||||
| from ..util import load_model | ||||
| from ._util import NAME, Arg, Opt, app, debug_cli | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command("profile") | ||||
|  |  | |||
|  | @ -1,16 +1,27 @@ | |||
| from typing import Any, Dict, Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Optional | ||||
| 
 | ||||
| import requests | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ...util import ensure_path, working_dir | ||||
| from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config | ||||
| from .._util import get_checksum, download_file, git_checkout, get_git_version | ||||
| from .._util import SimpleFrozenDict, parse_config_overrides | ||||
| from .._util import ( | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     SimpleFrozenDict, | ||||
|     download_file, | ||||
|     get_checksum, | ||||
|     get_git_version, | ||||
|     git_checkout, | ||||
|     load_project_config, | ||||
|     parse_config_overrides, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| # Whether assets are extra if `extra` is not set. | ||||
| EXTRA_DEFAULT = False | ||||
|  |  | |||
|  | @ -1,13 +1,22 @@ | |||
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import subprocess | ||||
| import re | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ... import about | ||||
| from ...util import ensure_path | ||||
| from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE | ||||
| from .._util import git_checkout, get_git_version, git_repo_branch_exists | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_git_version, | ||||
|     git_checkout, | ||||
|     git_repo_branch_exists, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| DEFAULT_REPO = about.__projects__ | ||||
| DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| from pathlib import Path | ||||
| from wasabi import msg, MarkdownRenderer | ||||
| 
 | ||||
| from wasabi import MarkdownRenderer, msg | ||||
| 
 | ||||
| from ...util import working_dir | ||||
| from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config | ||||
| 
 | ||||
| from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli | ||||
| 
 | ||||
| DOCS_URL = "https://spacy.io" | ||||
| INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the | ||||
|  |  | |||
|  | @ -1,15 +1,28 @@ | |||
| """This module contains helpers and subcommands for integrating spaCy projects | ||||
| with Data Version Control (DVC). https://dvc.org""" | ||||
| from typing import Dict, Any, List, Optional, Iterable | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, List, Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli | ||||
| from .._util import Arg, Opt, NAME, COMMAND | ||||
| from ...util import working_dir, split_command, join_command, run_command | ||||
| from ...util import SimpleFrozenList | ||||
| 
 | ||||
| from ...util import ( | ||||
|     SimpleFrozenList, | ||||
|     join_command, | ||||
|     run_command, | ||||
|     split_command, | ||||
|     working_dir, | ||||
| ) | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     NAME, | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_hash, | ||||
|     load_project_config, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| DVC_CONFIG = "dvc.yaml" | ||||
| DVC_DIR = ".dvc" | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| from .remote_storage import RemoteStorage | ||||
| from .remote_storage import get_command_hash | ||||
| from .._util import project_cli, Arg, logger | ||||
| from .._util import load_project_config | ||||
| 
 | ||||
| from .._util import Arg, load_project_config, logger, project_cli | ||||
| from .remote_storage import RemoteStorage, get_command_hash | ||||
| from .run import update_lockfile | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| from .remote_storage import RemoteStorage | ||||
| from .remote_storage import get_content_hash, get_command_hash | ||||
| from .._util import load_project_config | ||||
| from .._util import project_cli, Arg, logger | ||||
| 
 | ||||
| from .._util import Arg, load_project_config, logger, project_cli | ||||
| from .remote_storage import RemoteStorage, get_command_hash, get_content_hash | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("push") | ||||
|  |  | |||
|  | @ -1,18 +1,25 @@ | |||
| from typing import Optional, List, Dict, TYPE_CHECKING | ||||
| import hashlib | ||||
| import os | ||||
| import site | ||||
| import hashlib | ||||
| import urllib.parse | ||||
| import tarfile | ||||
| import urllib.parse | ||||
| from pathlib import Path | ||||
| from typing import TYPE_CHECKING, Dict, List, Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .._util import get_hash, get_checksum, upload_file, download_file | ||||
| from .._util import ensure_pathy, make_tempdir | ||||
| from ...util import get_minor_version, ENV_VARS, check_bool_env_var | ||||
| from ...git_info import GIT_VERSION | ||||
| from ... import about | ||||
| from ...errors import Errors | ||||
| from ...git_info import GIT_VERSION | ||||
| from ...util import ENV_VARS, check_bool_env_var, get_minor_version | ||||
| from .._util import ( | ||||
|     download_file, | ||||
|     ensure_pathy, | ||||
|     get_checksum, | ||||
|     get_hash, | ||||
|     make_tempdir, | ||||
|     upload_file, | ||||
| ) | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pathy import FluidPath  # noqa: F401 | ||||
|  |  | |||
|  | @ -1,20 +1,39 @@ | |||
| from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple | ||||
| import os.path | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| from wasabi.util import locale_escape | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple | ||||
| 
 | ||||
| import srsly | ||||
| import typer | ||||
| from wasabi import msg | ||||
| from wasabi.util import locale_escape | ||||
| 
 | ||||
| from ... import about | ||||
| from ...git_info import GIT_VERSION | ||||
| from ...util import working_dir, run_command, split_command, is_cwd, join_command | ||||
| from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS | ||||
| from ...util import check_bool_env_var, SimpleFrozenDict | ||||
| from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash | ||||
| from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides | ||||
| from ...util import ( | ||||
|     ENV_VARS, | ||||
|     SimpleFrozenDict, | ||||
|     SimpleFrozenList, | ||||
|     check_bool_env_var, | ||||
|     is_cwd, | ||||
|     is_minor_version_match, | ||||
|     join_command, | ||||
|     run_command, | ||||
|     split_command, | ||||
|     working_dir, | ||||
| ) | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     PROJECT_FILE, | ||||
|     PROJECT_LOCK, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_checksum, | ||||
|     get_hash, | ||||
|     load_project_config, | ||||
|     parse_config_overrides, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command( | ||||
|  |  | |||
|  | @ -1,15 +1,23 @@ | |||
| from typing import Optional, Dict, Any, Union | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| import logging | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Optional, Union | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code, setup_gpu | ||||
| from ..training.loop import train as train_nlp | ||||
| from ..training.initialize import init_nlp | ||||
| from .. import util | ||||
| from ..training.initialize import init_nlp | ||||
| from ..training.loop import train as train_nlp | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     app, | ||||
|     import_code, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
|  |  | |||
|  | @ -1,14 +1,21 @@ | |||
| from typing import Tuple | ||||
| from pathlib import Path | ||||
| import sys | ||||
| import requests | ||||
| from wasabi import msg, Printer | ||||
| import warnings | ||||
| from pathlib import Path | ||||
| from typing import Tuple | ||||
| 
 | ||||
| import requests | ||||
| from wasabi import Printer, msg | ||||
| 
 | ||||
| from ._util import app | ||||
| from .. import about | ||||
| from ..util import get_package_version, get_installed_models, get_minor_version | ||||
| from ..util import get_package_path, get_model_meta, is_compatible_version | ||||
| from ..util import ( | ||||
|     get_installed_models, | ||||
|     get_minor_version, | ||||
|     get_model_meta, | ||||
|     get_package_path, | ||||
|     get_package_version, | ||||
|     is_compatible_version, | ||||
| ) | ||||
| from ._util import app | ||||
| 
 | ||||
| 
 | ||||
| @app.command("validate") | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| """Helpers for Python and platform compatibility.""" | ||||
| import sys | ||||
| 
 | ||||
| from thinc.util import copy_array | ||||
| 
 | ||||
| try: | ||||
|  |  | |||
|  | @ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities. | |||
| DOCS: https://spacy.io/api/top-level#displacy | ||||
| USAGE: https://spacy.io/usage/visualizers | ||||
| """ | ||||
| from typing import Union, Iterable, Optional, Dict, Any, Callable | ||||
| import warnings | ||||
| from typing import Any, Callable, Dict, Iterable, Optional, Union | ||||
| 
 | ||||
| from .render import DependencyRenderer, EntityRenderer, SpanRenderer | ||||
| from ..tokens import Doc, Span | ||||
| from ..errors import Errors, Warnings | ||||
| from ..util import is_in_jupyter | ||||
| from ..util import find_available_port | ||||
| 
 | ||||
| from ..tokens import Doc, Span | ||||
| from ..util import find_available_port, is_in_jupyter | ||||
| from .render import DependencyRenderer, EntityRenderer, SpanRenderer | ||||
| 
 | ||||
| _html = {} | ||||
| RENDER_WRAPPER = None | ||||
|  | @ -68,7 +66,7 @@ def render( | |||
|     if jupyter or (jupyter is None and is_in_jupyter()): | ||||
|         # return HTML rendered by IPython display() | ||||
|         # See #4840 for details on span wrapper to disable mathjax | ||||
|         from IPython.core.display import display, HTML | ||||
|         from IPython.core.display import HTML, display | ||||
| 
 | ||||
|         return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html))) | ||||
|     return html | ||||
|  |  | |||
|  | @ -1,15 +1,29 @@ | |||
| from typing import Any, Dict, List, Optional, Tuple, Union | ||||
| import uuid | ||||
| import itertools | ||||
| import uuid | ||||
| from typing import Any, Dict, List, Optional, Tuple, Union | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| from ..util import escape_html, minify_html, registry | ||||
| from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS | ||||
| from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS | ||||
| from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN | ||||
| from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL | ||||
| from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS | ||||
| from .templates import TPL_TITLE | ||||
| from .templates import ( | ||||
|     TPL_DEP_ARCS, | ||||
|     TPL_DEP_SVG, | ||||
|     TPL_DEP_WORDS, | ||||
|     TPL_DEP_WORDS_LEMMA, | ||||
|     TPL_ENT, | ||||
|     TPL_ENT_RTL, | ||||
|     TPL_ENTS, | ||||
|     TPL_FIGURE, | ||||
|     TPL_KB_LINK, | ||||
|     TPL_PAGE, | ||||
|     TPL_SPAN, | ||||
|     TPL_SPAN_RTL, | ||||
|     TPL_SPAN_SLICE, | ||||
|     TPL_SPAN_SLICE_RTL, | ||||
|     TPL_SPAN_START, | ||||
|     TPL_SPAN_START_RTL, | ||||
|     TPL_SPANS, | ||||
|     TPL_TITLE, | ||||
| ) | ||||
| 
 | ||||
| DEFAULT_LANG = "en" | ||||
| DEFAULT_DIR = "ltr" | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from typing import Literal | ||||
| import warnings | ||||
| from typing import Literal | ||||
| 
 | ||||
| 
 | ||||
| class ErrorsWithCodes(type): | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| import warnings | ||||
| 
 | ||||
| from .errors import Warnings | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from .candidate import Candidate, InMemoryCandidate | ||||
| from .kb import KnowledgeBase | ||||
| from .kb_in_memory import InMemoryLookupKB | ||||
| from .candidate import Candidate, InMemoryCandidate | ||||
| 
 | ||||
| __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] | ||||
|  |  | |||
|  | @ -1,6 +1,8 @@ | |||
| from libcpp.vector cimport vector | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| 
 | ||||
| from ..typedefs cimport hash_t | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| 
 | ||||
| 
 | ||||
| cdef class Candidate: | ||||
|     pass | ||||
|  | @ -9,7 +11,7 @@ cdef class Candidate: | |||
| cdef class InMemoryCandidate(Candidate): | ||||
|     cdef readonly hash_t _entity_hash | ||||
|     cdef readonly hash_t _alias_hash | ||||
|     cpdef vector[float] _entity_vector | ||||
|     cdef vector[float] _entity_vector | ||||
|     cdef float _prior_prob | ||||
|     cdef readonly InMemoryLookupKB _kb | ||||
|     cdef float _entity_freq | ||||
|  |  | |||
|  | @ -1,8 +1,10 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| 
 | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| cdef class Candidate: | ||||
|     """A `Candidate` object refers to a textual mention that may or may not be resolved | ||||
|     to a specific entity from a Knowledge Base. This will be used as input for the entity linking | ||||
|  |  | |||
|  | @ -2,8 +2,10 @@ | |||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| from libc.stdint cimport int64_t | ||||
| 
 | ||||
| from ..vocab cimport Vocab | ||||
| 
 | ||||
| 
 | ||||
| cdef class KnowledgeBase: | ||||
|     cdef Pool mem | ||||
|     cdef readonly Vocab vocab | ||||
|  |  | |||
|  | @ -2,12 +2,13 @@ | |||
| 
 | ||||
| from pathlib import Path | ||||
| from typing import Iterable, Tuple, Union | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| from .candidate import Candidate | ||||
| from ..errors import Errors | ||||
| from ..tokens import Span, SpanGroup | ||||
| from ..util import SimpleFrozenList | ||||
| from ..errors import Errors | ||||
| from .candidate import Candidate | ||||
| 
 | ||||
| 
 | ||||
| cdef class KnowledgeBase: | ||||
|  |  | |||
|  | @ -1,11 +1,11 @@ | |||
| """Knowledge-base for entity or concept linking.""" | ||||
| from preshed.maps cimport PreshMap | ||||
| from libcpp.vector cimport vector | ||||
| from libc.stdint cimport int32_t, int64_t | ||||
| from libc.stdio cimport FILE | ||||
| from libcpp.vector cimport vector | ||||
| from preshed.maps cimport PreshMap | ||||
| 
 | ||||
| from ..structs cimport AliasC, KBEntryC | ||||
| from ..typedefs cimport hash_t | ||||
| from ..structs cimport KBEntryC, AliasC | ||||
| from .kb cimport KnowledgeBase | ||||
| 
 | ||||
| ctypedef vector[KBEntryC] entry_vec | ||||
|  |  | |||
|  | @ -1,23 +1,28 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| from typing import Iterable, Callable, Dict, Any, Union | ||||
| from typing import Any, Callable, Dict, Iterable, Union | ||||
| 
 | ||||
| import srsly | ||||
| from preshed.maps cimport PreshMap | ||||
| from cpython.exc cimport PyErr_SetFromErrno | ||||
| from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek | ||||
| from libc.stdint cimport int32_t, int64_t | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
| from pathlib import Path | ||||
| from cpython.exc cimport PyErr_SetFromErrno | ||||
| from libc.stdint cimport int32_t, int64_t | ||||
| from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite | ||||
| from libcpp.vector cimport vector | ||||
| from preshed.maps cimport PreshMap | ||||
| 
 | ||||
| import warnings | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ..tokens import Span | ||||
| 
 | ||||
| from ..typedefs cimport hash_t | ||||
| from ..errors import Errors, Warnings | ||||
| 
 | ||||
| from .. import util | ||||
| from ..errors import Errors, Warnings | ||||
| from ..util import SimpleFrozenList, ensure_path | ||||
| 
 | ||||
| from ..vocab cimport Vocab | ||||
| from .kb cimport KnowledgeBase | ||||
| 
 | ||||
| from .candidate import InMemoryCandidate | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from ...language import BaseDefaults, Language | ||||
| from .stop_words import STOP_WORDS | ||||
| from ...language import Language, BaseDefaults | ||||
| 
 | ||||
| 
 | ||||
| class AfrikaansDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,12 +1,11 @@ | |||
| from .stop_words import STOP_WORDS | ||||
| from ...attrs import LANG | ||||
| from ...language import BaseDefaults, Language | ||||
| from ...util import update_exc | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_SUFFIXES | ||||
| 
 | ||||
| from .stop_words import STOP_WORDS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...language import Language, BaseDefaults | ||||
| from ...attrs import LANG | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| 
 | ||||
| class AmharicDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,5 +1,11 @@ | |||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY | ||||
| from ..char_classes import UNITS, ALPHA_UPPER | ||||
| from ..char_classes import ( | ||||
|     ALPHA_UPPER, | ||||
|     CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     UNITS, | ||||
| ) | ||||
| 
 | ||||
| _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| from ...symbols import ORTH, NORM | ||||
| 
 | ||||
| from ...symbols import NORM, ORTH | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,8 +1,8 @@ | |||
| from .stop_words import STOP_WORDS | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_SUFFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from ...language import Language, BaseDefaults | ||||
| 
 | ||||
| 
 | ||||
| class ArabicDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,5 +1,11 @@ | |||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY | ||||
| from ..char_classes import UNITS, ALPHA_UPPER | ||||
| from ..char_classes import ( | ||||
|     ALPHA_UPPER, | ||||
|     CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     UNITS, | ||||
| ) | ||||
| 
 | ||||
| _suffixes = ( | ||||
|     LIST_PUNCT | ||||
|  |  | |||
|  | @ -1,7 +1,6 @@ | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from .stop_words import STOP_WORDS | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from ...language import Language, BaseDefaults | ||||
| from .stop_words import STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class AzerbaijaniDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| # Eleven, twelve etc. are written separately: on bir, on iki | ||||
| 
 | ||||
| _num_words = [ | ||||
|  |  | |||
|  | @ -1,12 +1,14 @@ | |||
| from ...attrs import LANG | ||||
| from ...language import BaseDefaults, Language | ||||
| from ...util import update_exc | ||||
| from ..punctuation import ( | ||||
|     COMBINING_DIACRITICS_TOKENIZER_INFIXES, | ||||
|     COMBINING_DIACRITICS_TOKENIZER_SUFFIXES, | ||||
| ) | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .stop_words import STOP_WORDS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES | ||||
| from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES | ||||
| from ...language import Language, BaseDefaults | ||||
| from ...attrs import LANG | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| 
 | ||||
| class BulgarianDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| _num_words = [ | ||||
|     "нула", | ||||
|     "едно", | ||||
|  |  | |||
|  | @ -4,8 +4,7 @@ References: | |||
|     (countries, occupations, fields of studies and more). | ||||
| """ | ||||
| 
 | ||||
| from ...symbols import ORTH, NORM | ||||
| 
 | ||||
| from ...symbols import NORM, ORTH | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,10 +1,12 @@ | |||
| from typing import Optional, Callable | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Model | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from ...language import Language, BaseDefaults | ||||
| 
 | ||||
| from ...language import BaseDefaults, Language | ||||
| from ...pipeline import Lemmatizer | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class BengaliDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,6 +1,14 @@ | |||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS | ||||
| from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS | ||||
| 
 | ||||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     CONCAT_QUOTES, | ||||
|     HYPHENS, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     UNITS, | ||||
| ) | ||||
| 
 | ||||
| _currency = r"\$¢£€¥฿৳" | ||||
| _quotes = CONCAT_QUOTES.replace("'", "") | ||||
|  |  | |||
|  | @ -1,7 +1,6 @@ | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,14 +1,14 @@ | |||
| from typing import Optional, Callable | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Model | ||||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from ...language import Language, BaseDefaults | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lemmatizer import CatalanLemmatizer | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class CatalanDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| _num_words = [ | ||||
|     "zero", | ||||
|     "un", | ||||
|  |  | |||
|  | @ -1,9 +1,18 @@ | |||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS | ||||
| from ..char_classes import LIST_CURRENCY | ||||
| from ..char_classes import CURRENCY | ||||
| from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT | ||||
| from ..char_classes import merge_chars, _units | ||||
| 
 | ||||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     ALPHA_UPPER, | ||||
|     CONCAT_QUOTES, | ||||
|     CURRENCY, | ||||
|     LIST_CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     PUNCT, | ||||
|     _units, | ||||
|     merge_chars, | ||||
| ) | ||||
| 
 | ||||
| ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,7 +1,8 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| from ...tokens import Doc, Span | ||||
| from ...symbols import NOUN, PROPN | ||||
| from typing import Iterator, Tuple, Union | ||||
| 
 | ||||
| from ...errors import Errors | ||||
| from ...symbols import NOUN, PROPN | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
| def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | ||||
|  |  | |||
|  | @ -1,7 +1,6 @@ | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from .stop_words import STOP_WORDS | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from ...language import Language, BaseDefaults | ||||
| from .stop_words import STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class CzechDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from ...language import Language, BaseDefaults | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class DanishDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| # Source http://fjern-uv.dk/tal.php | ||||
| _num_words = """nul | ||||
| en et to tre fire fem seks syv otte ni ti | ||||
|  |  | |||
|  | @ -1,8 +1,13 @@ | |||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS | ||||
| from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     ALPHA_UPPER, | ||||
|     CONCAT_QUOTES, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
| ) | ||||
| from ..punctuation import TOKENIZER_SUFFIXES | ||||
| 
 | ||||
| 
 | ||||
| _quotes = CONCAT_QUOTES.replace("'", "") | ||||
| 
 | ||||
| _infixes = ( | ||||
|  |  | |||
|  | @ -1,7 +1,8 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| from ...tokens import Doc, Span | ||||
| from ...symbols import NOUN, PROPN, PRON, VERB, AUX | ||||
| from typing import Iterator, Tuple, Union | ||||
| 
 | ||||
| from ...errors import Errors | ||||
| from ...symbols import AUX, NOUN, PRON, PROPN, VERB | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
| def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | ||||
|  |  | |||
|  | @ -2,10 +2,9 @@ | |||
| Tokenizer Exceptions. | ||||
| Source: https://forkortelse.dk/ and various others. | ||||
| """ | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,8 +1,8 @@ | |||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||
| from ...language import BaseDefaults, Language | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from ...language import Language, BaseDefaults | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class GermanDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,9 +1,18 @@ | |||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES | ||||
| from ..char_classes import CURRENCY, UNITS, PUNCT | ||||
| from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     ALPHA_UPPER, | ||||
|     CONCAT_QUOTES, | ||||
|     CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     PUNCT, | ||||
|     UNITS, | ||||
| ) | ||||
| from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES | ||||
| 
 | ||||
| 
 | ||||
| _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES | ||||
| 
 | ||||
| _suffixes = ( | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| from typing import Iterator, Tuple, Union | ||||
| 
 | ||||
| from ...symbols import NOUN, PROPN, PRON | ||||
| from ...errors import Errors | ||||
| from ...symbols import NOUN, PRON, PROPN | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,7 +1,6 @@ | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc = { | ||||
|     "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}], | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from ...language import BaseDefaults, Language | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .stop_words import STOP_WORDS | ||||
| from ...language import Language, BaseDefaults | ||||
| 
 | ||||
| 
 | ||||
| class LowerSorbianDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,13 +1,14 @@ | |||
| from typing import Optional, Callable | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Model | ||||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lemmatizer import GreekLemmatizer | ||||
| from ...language import Language, BaseDefaults | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class GreekDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| def get_pos_from_wiktionary(): | ||||
|     import re | ||||
| 
 | ||||
|     from gensim.corpora.wikicorpus import extract_pages | ||||
| 
 | ||||
|     regex = re.compile(r"==={{(\w+)\|el}}===") | ||||
|  |  | |||
|  | @ -1,6 +1,16 @@ | |||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY | ||||
| from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS | ||||
| from ..char_classes import CONCAT_QUOTES, CURRENCY | ||||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     ALPHA_UPPER, | ||||
|     CONCAT_QUOTES, | ||||
|     CURRENCY, | ||||
|     HYPHENS, | ||||
|     LIST_CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
| ) | ||||
| 
 | ||||
| _units = ( | ||||
|     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft " | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| from typing import Iterator, Tuple, Union | ||||
| 
 | ||||
| from ...symbols import NOUN, PROPN, PRON | ||||
| from ...errors import Errors | ||||
| from ...symbols import NOUN, PRON, PROPN | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc = {} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,13 +1,14 @@ | |||
| from typing import Optional, Callable | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Model | ||||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .punctuation import TOKENIZER_INFIXES | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lemmatizer import EnglishLemmatizer | ||||
| from ...language import Language, BaseDefaults | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_INFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class EnglishDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,5 +1,12 @@ | |||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS | ||||
| from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA | ||||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     ALPHA_UPPER, | ||||
|     CONCAT_QUOTES, | ||||
|     HYPHENS, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
| ) | ||||
| 
 | ||||
| _infixes = ( | ||||
|     LIST_ELLIPSES | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| from typing import Iterator, Tuple, Union | ||||
| 
 | ||||
| from ...symbols import NOUN, PROPN, PRON | ||||
| from ...errors import Errors | ||||
| from ...symbols import NOUN, PRON, PROPN | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,8 +1,8 @@ | |||
| from typing import Dict, List | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc: Dict[str, List[Dict]] = {} | ||||
| _exclude = [ | ||||
|  |  | |||
|  | @ -1,12 +1,14 @@ | |||
| from typing import Optional, Callable | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Model | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| 
 | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lemmatizer import SpanishLemmatizer | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES | ||||
| from ...language import Language, BaseDefaults | ||||
| from .stop_words import STOP_WORDS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class SpanishDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from typing import List, Optional, Tuple | ||||
| import re | ||||
| from typing import List, Optional, Tuple | ||||
| 
 | ||||
| from ...pipeline import Lemmatizer | ||||
| from ...tokens import Token | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| _num_words = [ | ||||
|     "cero", | ||||
|     "uno", | ||||
|  |  | |||
|  | @ -1,8 +1,17 @@ | |||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES | ||||
| from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT | ||||
| from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA | ||||
| from ..char_classes import merge_chars | ||||
| 
 | ||||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     ALPHA_UPPER, | ||||
|     CONCAT_QUOTES, | ||||
|     CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     LIST_UNITS, | ||||
|     PUNCT, | ||||
|     merge_chars, | ||||
| ) | ||||
| 
 | ||||
| _list_units = [u for u in LIST_UNITS if u != "%"] | ||||
| _units = merge_chars(" ".join(_list_units)) | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| from typing import Iterator, Tuple, Union | ||||
| 
 | ||||
| from ...symbols import NOUN, PROPN, PRON | ||||
| from ...errors import Errors | ||||
| from ...symbols import NOUN, PRON, PROPN | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,7 +1,6 @@ | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...symbols import ORTH, NORM | ||||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
| _exc = { | ||||
|     "pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}], | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from ...language import BaseDefaults, Language | ||||
| from .stop_words import STOP_WORDS | ||||
| from ...language import Language, BaseDefaults | ||||
| 
 | ||||
| 
 | ||||
| class EstonianDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| from .stop_words import STOP_WORDS | ||||
| from ...language import BaseDefaults, Language | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_SUFFIXES | ||||
| from ...language import Language, BaseDefaults | ||||
| from .stop_words import STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class BasqueDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,4 +1,3 @@ | |||
| from ..punctuation import TOKENIZER_SUFFIXES | ||||
| 
 | ||||
| 
 | ||||
| _suffixes = TOKENIZER_SUFFIXES | ||||
|  |  | |||
|  | @ -1,12 +1,14 @@ | |||
| from typing import Optional, Callable | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
| from thinc.api import Model | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .punctuation import TOKENIZER_SUFFIXES | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from ...language import Language, BaseDefaults | ||||
| 
 | ||||
| from ...language import BaseDefaults, Language | ||||
| from ...pipeline import Lemmatizer | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_SUFFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class PersianDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| MIM = "م" | ||||
| ZWNJ_O_MIM = "ام" | ||||
| YE_NUN = "ین" | ||||
|  |  | |||
|  | @ -1,5 +1,11 @@ | |||
| from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY | ||||
| from ..char_classes import UNITS, ALPHA_UPPER | ||||
| from ..char_classes import ( | ||||
|     ALPHA_UPPER, | ||||
|     CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     UNITS, | ||||
| ) | ||||
| 
 | ||||
| _suffixes = ( | ||||
|     LIST_PUNCT | ||||
|  |  | |||
|  | @ -1,7 +1,8 @@ | |||
| from typing import Union, Iterator, Tuple | ||||
| from ...tokens import Doc, Span | ||||
| from ...symbols import NOUN, PROPN, PRON | ||||
| from typing import Iterator, Tuple, Union | ||||
| 
 | ||||
| from ...errors import Errors | ||||
| from ...symbols import NOUN, PRON, PROPN | ||||
| from ...tokens import Doc, Span | ||||
| 
 | ||||
| 
 | ||||
| def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| from ...symbols import ORTH, NORM | ||||
| 
 | ||||
| from ...symbols import NORM, ORTH | ||||
| 
 | ||||
| TOKENIZER_EXCEPTIONS = { | ||||
|     ".ق ": [{ORTH: ".ق "}], | ||||
|  |  | |||
Some files were not shown because too many files have changed in this diff Show More
		Loading…
	
		Reference in New Issue
	
	Block a user