mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Merge branch 'master' into feat/add-pipe-instance
This commit is contained in:
commit
dcd8a765fd
4
.github/workflows/tests.yml
vendored
4
.github/workflows/tests.yml
vendored
|
@ -37,6 +37,10 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
python -m pip install black -c requirements.txt
|
python -m pip install black -c requirements.txt
|
||||||
python -m black spacy --check
|
python -m black spacy --check
|
||||||
|
- name: isort
|
||||||
|
run: |
|
||||||
|
python -m pip install isort -c requirements.txt
|
||||||
|
python -m isort spacy --check
|
||||||
- name: flake8
|
- name: flake8
|
||||||
run: |
|
run: |
|
||||||
python -m pip install flake8==5.0.4
|
python -m pip install flake8==5.0.4
|
||||||
|
|
|
@ -9,3 +9,6 @@ requires = [
|
||||||
"numpy>=1.15.0",
|
"numpy>=1.15.0",
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[tool.isort]
|
||||||
|
profile = "black"
|
||||||
|
|
|
@ -38,3 +38,4 @@ types-setuptools>=57.0.0
|
||||||
types-requests
|
types-requests
|
||||||
types-setuptools>=57.0.0
|
types-setuptools>=57.0.0
|
||||||
black==22.3.0
|
black==22.3.0
|
||||||
|
isort>=5.0,<6.0
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import Union, Iterable, Dict, Any
|
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Iterable, Union
|
||||||
|
|
||||||
# set library-specific custom warning handling before doing anything else
|
# set library-specific custom warning handling before doing anything else
|
||||||
from .errors import setup_default_warnings
|
from .errors import setup_default_warnings
|
||||||
|
@ -8,20 +8,17 @@ from .errors import setup_default_warnings
|
||||||
setup_default_warnings() # noqa: E402
|
setup_default_warnings() # noqa: E402
|
||||||
|
|
||||||
# These are imported as part of the API
|
# These are imported as part of the API
|
||||||
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
|
from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
|
||||||
from thinc.api import Config
|
|
||||||
|
|
||||||
from . import pipeline # noqa: F401
|
from . import pipeline # noqa: F401
|
||||||
from .cli.info import info # noqa: F401
|
|
||||||
from .glossary import explain # noqa: F401
|
|
||||||
from .about import __version__ # noqa: F401
|
|
||||||
from .util import registry, logger # noqa: F401
|
|
||||||
|
|
||||||
from .errors import Errors
|
|
||||||
from .language import Language
|
|
||||||
from .vocab import Vocab
|
|
||||||
from . import util
|
from . import util
|
||||||
|
from .about import __version__ # noqa: F401
|
||||||
|
from .cli.info import info # noqa: F401
|
||||||
|
from .errors import Errors
|
||||||
|
from .glossary import explain # noqa: F401
|
||||||
|
from .language import Language
|
||||||
|
from .util import logger, registry # noqa: F401
|
||||||
|
from .vocab import Vocab
|
||||||
|
|
||||||
if sys.maxunicode == 65535:
|
if sys.maxunicode == 65535:
|
||||||
raise SystemError(Errors.E130)
|
raise SystemError(Errors.E130)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "3.6.0.dev1"
|
__version__ = "3.6.0"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# Reserve 64 values for flag features
|
# Reserve 64 values for flag features
|
||||||
from . cimport symbols
|
from . cimport symbols
|
||||||
|
|
||||||
|
|
||||||
cdef enum attr_id_t:
|
cdef enum attr_id_t:
|
||||||
NULL_ATTR
|
NULL_ATTR
|
||||||
IS_ALPHA
|
IS_ALPHA
|
||||||
|
|
|
@ -1,35 +1,35 @@
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, setup_cli # noqa: F401
|
from ._util import app, setup_cli # noqa: F401
|
||||||
|
from .apply import apply # noqa: F401
|
||||||
|
from .assemble import assemble_cli # noqa: F401
|
||||||
|
|
||||||
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||||
# are registered automatically and won't have to be imported here.
|
# are registered automatically and won't have to be imported here.
|
||||||
from .benchmark_speed import benchmark_speed_cli # noqa: F401
|
from .benchmark_speed import benchmark_speed_cli # noqa: F401
|
||||||
from .download import download # noqa: F401
|
|
||||||
from .info import info # noqa: F401
|
|
||||||
from .package import package # noqa: F401
|
|
||||||
from .profile import profile # noqa: F401
|
|
||||||
from .train import train_cli # noqa: F401
|
|
||||||
from .assemble import assemble_cli # noqa: F401
|
|
||||||
from .pretrain import pretrain # noqa: F401
|
|
||||||
from .debug_data import debug_data # noqa: F401
|
|
||||||
from .debug_config import debug_config # noqa: F401
|
|
||||||
from .debug_model import debug_model # noqa: F401
|
|
||||||
from .debug_diff import debug_diff # noqa: F401
|
|
||||||
from .evaluate import evaluate # noqa: F401
|
|
||||||
from .apply import apply # noqa: F401
|
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
from .debug_config import debug_config # noqa: F401
|
||||||
from .init_config import init_config, fill_config # noqa: F401
|
from .debug_data import debug_data # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .debug_diff import debug_diff # noqa: F401
|
||||||
from .project.clone import project_clone # noqa: F401
|
from .debug_model import debug_model # noqa: F401
|
||||||
from .project.assets import project_assets # noqa: F401
|
from .download import download # noqa: F401
|
||||||
from .project.run import project_run # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .project.dvc import project_update_dvc # noqa: F401
|
|
||||||
from .project.push import project_push # noqa: F401
|
|
||||||
from .project.pull import project_pull # noqa: F401
|
|
||||||
from .project.document import project_document # noqa: F401
|
|
||||||
from .find_threshold import find_threshold # noqa: F401
|
from .find_threshold import find_threshold # noqa: F401
|
||||||
|
from .info import info # noqa: F401
|
||||||
|
from .init_config import fill_config, init_config # noqa: F401
|
||||||
|
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||||
|
from .package import package # noqa: F401
|
||||||
|
from .pretrain import pretrain # noqa: F401
|
||||||
|
from .profile import profile # noqa: F401
|
||||||
|
from .project.assets import project_assets # noqa: F401
|
||||||
|
from .project.clone import project_clone # noqa: F401
|
||||||
|
from .project.document import project_document # noqa: F401
|
||||||
|
from .project.dvc import project_update_dvc # noqa: F401
|
||||||
|
from .project.pull import project_pull # noqa: F401
|
||||||
|
from .project.push import project_push # noqa: F401
|
||||||
|
from .project.run import project_run # noqa: F401
|
||||||
|
from .train import train_cli # noqa: F401
|
||||||
|
from .validate import validate # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -1,26 +1,44 @@
|
||||||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
|
|
||||||
from typing import TYPE_CHECKING, overload
|
|
||||||
import sys
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg, Printer
|
|
||||||
import srsly
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from configparser import InterpolationError
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
|
Any,
|
||||||
|
Dict,
|
||||||
|
Iterable,
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Tuple,
|
||||||
|
Union,
|
||||||
|
overload,
|
||||||
|
)
|
||||||
|
|
||||||
|
import srsly
|
||||||
import typer
|
import typer
|
||||||
from click import NoSuchOption
|
from click import NoSuchOption
|
||||||
from click.parser import split_arg_string
|
from click.parser import split_arg_string
|
||||||
from typer.main import get_command
|
|
||||||
from contextlib import contextmanager
|
|
||||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||||
from thinc.util import gpu_is_available
|
from thinc.util import gpu_is_available
|
||||||
from configparser import InterpolationError
|
from typer.main import get_command
|
||||||
import os
|
from wasabi import Printer, msg
|
||||||
|
|
||||||
|
from .. import about
|
||||||
from ..compat import Literal
|
from ..compat import Literal
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import (
|
||||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
ENV_VARS,
|
||||||
from .. import about
|
SimpleFrozenDict,
|
||||||
|
import_file,
|
||||||
|
is_compatible_version,
|
||||||
|
logger,
|
||||||
|
make_tempdir,
|
||||||
|
registry,
|
||||||
|
run_command,
|
||||||
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import FluidPath # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
|
|
@ -1,18 +1,15 @@
|
||||||
import tqdm
|
|
||||||
import srsly
|
|
||||||
|
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List, Iterable, cast, Union
|
from typing import Iterable, List, Optional, Union, cast
|
||||||
|
|
||||||
|
import srsly
|
||||||
|
import tqdm
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
|
|
||||||
|
|
||||||
from ..tokens import Doc, DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..vocab import Vocab
|
|
||||||
from ..util import ensure_path, load_model
|
from ..util import ensure_path, load_model
|
||||||
|
from ..vocab import Vocab
|
||||||
|
from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
|
||||||
|
|
||||||
path_help = """Location of the documents to predict on.
|
path_help = """Location of the documents to predict on.
|
||||||
Can be a single file in .spacy format or a .jsonl file.
|
Can be a single file in .spacy format or a .jsonl file.
|
||||||
|
|
|
@ -1,13 +1,20 @@
|
||||||
from typing import Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import typer
|
|
||||||
import logging
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|
||||||
from ._util import import_code
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import get_sourced_components, load_model_from_config
|
from ..util import get_sourced_components, load_model_from_config
|
||||||
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
app,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
from typing import Iterable, List, Optional
|
|
||||||
import random
|
import random
|
||||||
from itertools import islice
|
|
||||||
import numpy
|
|
||||||
from pathlib import Path
|
|
||||||
import time
|
import time
|
||||||
from tqdm import tqdm
|
from itertools import islice
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional
|
||||||
|
|
||||||
|
import numpy
|
||||||
import typer
|
import typer
|
||||||
|
from tqdm import tqdm
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
|
@ -1,18 +1,22 @@
|
||||||
from typing import Callable, Iterable, Mapping, Optional, Any, Union
|
import itertools
|
||||||
from enum import Enum
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import Printer
|
|
||||||
import srsly
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Callable, Iterable, Mapping, Optional, Union
|
||||||
|
|
||||||
|
import srsly
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, walk_directory
|
|
||||||
from ..training import docs_to_json
|
|
||||||
from ..tokens import Doc, DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
from ..training import docs_to_json
|
||||||
from ..training.converters import conllu_to_docs
|
from ..training.converters import (
|
||||||
|
conll_ner_to_docs,
|
||||||
|
conllu_to_docs,
|
||||||
|
iob_to_docs,
|
||||||
|
json_to_docs,
|
||||||
|
)
|
||||||
|
from ._util import Arg, Opt, app, walk_directory
|
||||||
|
|
||||||
# Converters are matched by file extension except for ner/iob, which are
|
# Converters are matched by file extension except for ner/iob, which are
|
||||||
# matched by file extension and content. To add a converter, add a new
|
# matched by file extension and content. To add a converter, add a new
|
||||||
|
|
|
@ -1,15 +1,22 @@
|
||||||
from typing import Optional, Dict, Any, Union, List
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg, table
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
|
import typer
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
from thinc.config import VARIABLE_RE
|
from thinc.config import VARIABLE_RE
|
||||||
import typer
|
from wasabi import msg, table
|
||||||
|
|
||||||
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
from .. import util
|
||||||
from ._util import import_code, debug_cli
|
|
||||||
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
|
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
debug_cli,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
|
|
|
@ -1,31 +1,49 @@
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
|
|
||||||
from typing import cast, overload
|
|
||||||
from pathlib import Path
|
|
||||||
from collections import Counter
|
|
||||||
import sys
|
|
||||||
import srsly
|
|
||||||
from wasabi import Printer, MESSAGES, msg
|
|
||||||
import typer
|
|
||||||
import math
|
import math
|
||||||
import numpy
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import (
|
||||||
|
Any,
|
||||||
|
Dict,
|
||||||
|
Iterable,
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Sequence,
|
||||||
|
Set,
|
||||||
|
Tuple,
|
||||||
|
Union,
|
||||||
|
cast,
|
||||||
|
overload,
|
||||||
|
)
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
import numpy
|
||||||
from ._util import import_code, debug_cli, _format_number
|
import srsly
|
||||||
from ..training import Example, remove_bilu_prefix
|
import typer
|
||||||
from ..training.initialize import get_sourced_components
|
from wasabi import MESSAGES, Printer, msg
|
||||||
from ..schemas import ConfigSchemaTraining
|
|
||||||
from ..pipeline import TrainablePipe
|
from .. import util
|
||||||
|
from ..compat import Literal
|
||||||
|
from ..language import Language
|
||||||
|
from ..morphology import Morphology
|
||||||
|
from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
|
||||||
|
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||||
from ..pipeline import Morphologizer, SpanCategorizer
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
from ..training import Example, remove_bilu_prefix
|
||||||
from ..morphology import Morphology
|
from ..training.initialize import get_sourced_components
|
||||||
from ..language import Language
|
|
||||||
from ..util import registry, resolve_dot_names
|
from ..util import registry, resolve_dot_names
|
||||||
from ..compat import Literal
|
|
||||||
from ..vectors import Mode as VectorsMode
|
from ..vectors import Mode as VectorsMode
|
||||||
from .. import util
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
_format_number,
|
||||||
|
app,
|
||||||
|
debug_cli,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
# Minimum number of expected occurrences of NER label in data to train new label
|
# Minimum number of expected occurrences of NER label in data to train new label
|
||||||
NEW_LABEL_THRESHOLD = 50
|
NEW_LABEL_THRESHOLD = 50
|
||||||
|
@ -212,7 +230,7 @@ def debug_data(
|
||||||
else:
|
else:
|
||||||
msg.info("No word vectors present in the package")
|
msg.info("No word vectors present in the package")
|
||||||
|
|
||||||
if "spancat" in factory_names:
|
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||||
model_labels_spancat = _get_labels_from_spancat(nlp)
|
model_labels_spancat = _get_labels_from_spancat(nlp)
|
||||||
has_low_data_warning = False
|
has_low_data_warning = False
|
||||||
has_no_neg_warning = False
|
has_no_neg_warning = False
|
||||||
|
@ -830,7 +848,7 @@ def _compile_gold(
|
||||||
data["boundary_cross_ents"] += 1
|
data["boundary_cross_ents"] += 1
|
||||||
elif label == "-":
|
elif label == "-":
|
||||||
data["ner"]["-"] += 1
|
data["ner"]["-"] += 1
|
||||||
if "spancat" in factory_names:
|
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||||
for spans_key in list(eg.reference.spans.keys()):
|
for spans_key in list(eg.reference.spans.keys()):
|
||||||
# Obtain the span frequency
|
# Obtain the span frequency
|
||||||
if spans_key not in data["spancat"]:
|
if spans_key not in data["spancat"]:
|
||||||
|
@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
|
||||||
pipe_names = [
|
pipe_names = [
|
||||||
pipe_name
|
pipe_name
|
||||||
for pipe_name in nlp.pipe_names
|
for pipe_name in nlp.pipe_names
|
||||||
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
|
if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
|
||||||
]
|
]
|
||||||
labels: Dict[str, Set[str]] = {}
|
labels: Dict[str, Set[str]] = {}
|
||||||
for pipe_name in pipe_names:
|
for pipe_name in pipe_names:
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from wasabi import Printer, diff_strings, MarkdownRenderer
|
|
||||||
from pathlib import Path
|
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
|
from wasabi import MarkdownRenderer, Printer, diff_strings
|
||||||
|
|
||||||
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
|
|
||||||
from ..util import load_config
|
from ..util import load_config
|
||||||
from .init_config import init_config, Optimizations
|
from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
|
||||||
|
from .init_config import Optimizations, init_config
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
|
|
|
@ -1,19 +1,32 @@
|
||||||
from typing import Dict, Any, Optional
|
|
||||||
from pathlib import Path
|
|
||||||
import itertools
|
import itertools
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from thinc.api import (
|
||||||
|
Model,
|
||||||
|
data_validation,
|
||||||
|
fix_random_seed,
|
||||||
|
set_dropout_rate,
|
||||||
|
set_gpu_allocator,
|
||||||
|
)
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.util import resolve_dot_names
|
from spacy.util import resolve_dot_names
|
||||||
from wasabi import msg
|
|
||||||
from thinc.api import fix_random_seed, set_dropout_rate
|
|
||||||
from thinc.api import Model, data_validation, set_gpu_allocator
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from ._util import Arg, Opt, debug_cli, show_validation_error
|
from .. import util
|
||||||
from ._util import parse_config_overrides, string_to_list, setup_gpu
|
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
debug_cli,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
string_to_list,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
from typing import Optional, Sequence
|
|
||||||
import requests
|
|
||||||
import sys
|
import sys
|
||||||
from wasabi import msg
|
from typing import Optional, Sequence
|
||||||
import typer
|
|
||||||
|
import requests
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
|
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_minor_version, run_command
|
|
||||||
from ..util import is_prerelease_version
|
|
||||||
from ..errors import OLD_MODEL_SHORTCUTS
|
from ..errors import OLD_MODEL_SHORTCUTS
|
||||||
|
from ..util import get_minor_version, is_package, is_prerelease_version, run_command
|
||||||
|
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
|
|
@ -1,16 +1,16 @@
|
||||||
from typing import Optional, List, Dict, Any, Union
|
|
||||||
from wasabi import Printer
|
|
||||||
from pathlib import Path
|
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import fix_random_seed
|
from thinc.api import fix_random_seed
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ..training import Corpus
|
from .. import displacy, util
|
||||||
from ..tokens import Doc
|
|
||||||
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
|
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from .. import util
|
from ..tokens import Doc
|
||||||
from .. import displacy
|
from ..training import Corpus
|
||||||
|
from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
|
||||||
|
|
||||||
|
|
||||||
@benchmark_cli.command(
|
@benchmark_cli.command(
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
import functools
|
import functools
|
||||||
|
import logging
|
||||||
import operator
|
import operator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import logging
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
from typing import Optional, Tuple, Any, Dict, List
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import wasabi.tables
|
import wasabi.tables
|
||||||
|
|
||||||
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
|
|
||||||
from ..errors import Errors
|
|
||||||
from ..training import Corpus
|
|
||||||
from ._util import app, Arg, Opt, import_code, setup_gpu
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..errors import Errors
|
||||||
|
from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
|
||||||
|
from ..training import Corpus
|
||||||
|
from ._util import Arg, Opt, app, import_code, setup_gpu
|
||||||
|
|
||||||
_DEFAULTS = {
|
_DEFAULTS = {
|
||||||
"n_trials": 11,
|
"n_trials": 11,
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
from typing import Optional, Dict, Any, Union, List
|
|
||||||
import platform
|
|
||||||
import json
|
import json
|
||||||
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, MarkdownRenderer
|
from typing import Any, Dict, List, Optional, Union
|
||||||
import srsly
|
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, string_to_list
|
import srsly
|
||||||
from .download import get_model_filename, get_latest_version
|
from wasabi import MarkdownRenderer, Printer
|
||||||
from .. import util
|
|
||||||
from .. import about
|
from .. import about, util
|
||||||
from ..compat import importlib_metadata
|
from ..compat import importlib_metadata
|
||||||
|
from ._util import Arg, Opt, app, string_to_list
|
||||||
|
from .download import get_latest_version, get_model_filename
|
||||||
|
|
||||||
|
|
||||||
@app.command("info")
|
@app.command("info")
|
||||||
|
|
|
@ -1,19 +1,26 @@
|
||||||
from typing import Optional, List, Tuple
|
import re
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, diff_strings
|
from typing import List, Optional, Tuple
|
||||||
from thinc.api import Config
|
|
||||||
import srsly
|
import srsly
|
||||||
import re
|
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
|
from thinc.api import Config
|
||||||
|
from wasabi import Printer, diff_strings
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
||||||
from ..schemas import RecommendationSchema
|
from ..schemas import RecommendationSchema
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList
|
||||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
from ._util import (
|
||||||
from ._util import string_to_list, import_code
|
COMMAND,
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
import_code,
|
||||||
|
init_cli,
|
||||||
|
show_validation_error,
|
||||||
|
string_to_list,
|
||||||
|
)
|
||||||
|
|
||||||
ROOT = Path(__file__).parent / "templates"
|
ROOT = Path(__file__).parent / "templates"
|
||||||
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
||||||
|
|
|
@ -1,15 +1,23 @@
|
||||||
from typing import Optional
|
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from typing import Optional
|
||||||
import typer
|
|
||||||
import srsly
|
import srsly
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..training.initialize import init_nlp, convert_vectors
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
from ..training.initialize import convert_vectors, init_nlp
|
||||||
from ._util import import_code, setup_gpu
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
import_code,
|
||||||
|
init_cli,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("vectors")
|
@init_cli.command("vectors")
|
||||||
|
|
|
@ -1,18 +1,18 @@
|
||||||
from typing import Optional, Union, Any, Dict, List, Tuple, cast
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import Printer, MarkdownRenderer, get_raw_input
|
|
||||||
from thinc.api import Config
|
|
||||||
from collections import defaultdict
|
|
||||||
from catalogue import RegistryError
|
|
||||||
import srsly
|
|
||||||
import sys
|
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
|
import srsly
|
||||||
from ..schemas import validate, ModelMetaSchema
|
from catalogue import RegistryError
|
||||||
from .. import util
|
from thinc.api import Config
|
||||||
from .. import about
|
from wasabi import MarkdownRenderer, Printer, get_raw_input
|
||||||
|
|
||||||
|
from .. import about, util
|
||||||
|
from ..schemas import ModelMetaSchema, validate
|
||||||
|
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
|
||||||
|
|
||||||
|
|
||||||
@app.command("package")
|
@app.command("package")
|
||||||
|
|
|
@ -1,13 +1,21 @@
|
||||||
from typing import Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import typer
|
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|
||||||
from ._util import import_code, setup_gpu
|
|
||||||
from ..training.pretrain import pretrain
|
from ..training.pretrain import pretrain
|
||||||
from ..util import load_config
|
from ..util import load_config
|
||||||
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
app,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
|
|
@ -1,17 +1,18 @@
|
||||||
from typing import Optional, Sequence, Union, Iterator
|
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
|
||||||
import srsly
|
|
||||||
import cProfile
|
import cProfile
|
||||||
|
import itertools
|
||||||
import pstats
|
import pstats
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
from pathlib import Path
|
||||||
from wasabi import msg, Printer
|
from typing import Iterator, Optional, Sequence, Union
|
||||||
import typer
|
|
||||||
|
import srsly
|
||||||
|
import tqdm
|
||||||
|
import typer
|
||||||
|
from wasabi import Printer, msg
|
||||||
|
|
||||||
from ._util import app, debug_cli, Arg, Opt, NAME
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import load_model
|
from ..util import load_model
|
||||||
|
from ._util import NAME, Arg, Opt, app, debug_cli
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command("profile")
|
@debug_cli.command("profile")
|
||||||
|
|
|
@ -1,16 +1,27 @@
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import typer
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ...util import ensure_path, working_dir
|
from ...util import ensure_path, working_dir
|
||||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
from .._util import (
|
||||||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
PROJECT_FILE,
|
||||||
from .._util import SimpleFrozenDict, parse_config_overrides
|
Arg,
|
||||||
|
Opt,
|
||||||
|
SimpleFrozenDict,
|
||||||
|
download_file,
|
||||||
|
get_checksum,
|
||||||
|
get_git_version,
|
||||||
|
git_checkout,
|
||||||
|
load_project_config,
|
||||||
|
parse_config_overrides,
|
||||||
|
project_cli,
|
||||||
|
)
|
||||||
|
|
||||||
# Whether assets are extra if `extra` is not set.
|
# Whether assets are extra if `extra` is not set.
|
||||||
EXTRA_DEFAULT = False
|
EXTRA_DEFAULT = False
|
||||||
|
|
|
@ -1,13 +1,22 @@
|
||||||
from typing import Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import subprocess
|
|
||||||
import re
|
import re
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ... import about
|
from ... import about
|
||||||
from ...util import ensure_path
|
from ...util import ensure_path
|
||||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
from .._util import (
|
||||||
from .._util import git_checkout, get_git_version, git_repo_branch_exists
|
COMMAND,
|
||||||
|
PROJECT_FILE,
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
get_git_version,
|
||||||
|
git_checkout,
|
||||||
|
git_repo_branch_exists,
|
||||||
|
project_cli,
|
||||||
|
)
|
||||||
|
|
||||||
DEFAULT_REPO = about.__projects__
|
DEFAULT_REPO = about.__projects__
|
||||||
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
|
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg, MarkdownRenderer
|
|
||||||
|
from wasabi import MarkdownRenderer, msg
|
||||||
|
|
||||||
from ...util import working_dir
|
from ...util import working_dir
|
||||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
|
||||||
|
|
||||||
|
|
||||||
DOCS_URL = "https://spacy.io"
|
DOCS_URL = "https://spacy.io"
|
||||||
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
|
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
|
||||||
|
|
|
@ -1,15 +1,28 @@
|
||||||
"""This module contains helpers and subcommands for integrating spaCy projects
|
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||||
with Data Version Control (DVC). https://dvc.org"""
|
with Data Version Control (DVC). https://dvc.org"""
|
||||||
from typing import Dict, Any, List, Optional, Iterable
|
|
||||||
import subprocess
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Iterable, List, Optional
|
||||||
|
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
|
from ...util import (
|
||||||
from .._util import Arg, Opt, NAME, COMMAND
|
SimpleFrozenList,
|
||||||
from ...util import working_dir, split_command, join_command, run_command
|
join_command,
|
||||||
from ...util import SimpleFrozenList
|
run_command,
|
||||||
|
split_command,
|
||||||
|
working_dir,
|
||||||
|
)
|
||||||
|
from .._util import (
|
||||||
|
COMMAND,
|
||||||
|
NAME,
|
||||||
|
PROJECT_FILE,
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
get_hash,
|
||||||
|
load_project_config,
|
||||||
|
project_cli,
|
||||||
|
)
|
||||||
|
|
||||||
DVC_CONFIG = "dvc.yaml"
|
DVC_CONFIG = "dvc.yaml"
|
||||||
DVC_DIR = ".dvc"
|
DVC_DIR = ".dvc"
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
from .remote_storage import RemoteStorage
|
|
||||||
from .remote_storage import get_command_hash
|
from .._util import Arg, load_project_config, logger, project_cli
|
||||||
from .._util import project_cli, Arg, logger
|
from .remote_storage import RemoteStorage, get_command_hash
|
||||||
from .._util import load_project_config
|
|
||||||
from .run import update_lockfile
|
from .run import update_lockfile
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
from .remote_storage import RemoteStorage
|
|
||||||
from .remote_storage import get_content_hash, get_command_hash
|
from .._util import Arg, load_project_config, logger, project_cli
|
||||||
from .._util import load_project_config
|
from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
|
||||||
from .._util import project_cli, Arg, logger
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("push")
|
@project_cli.command("push")
|
||||||
|
|
|
@ -1,18 +1,25 @@
|
||||||
from typing import Optional, List, Dict, TYPE_CHECKING
|
import hashlib
|
||||||
import os
|
import os
|
||||||
import site
|
import site
|
||||||
import hashlib
|
|
||||||
import urllib.parse
|
|
||||||
import tarfile
|
import tarfile
|
||||||
|
import urllib.parse
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING, Dict, List, Optional
|
||||||
|
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from .._util import get_hash, get_checksum, upload_file, download_file
|
|
||||||
from .._util import ensure_pathy, make_tempdir
|
|
||||||
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
|
|
||||||
from ...git_info import GIT_VERSION
|
|
||||||
from ... import about
|
from ... import about
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...git_info import GIT_VERSION
|
||||||
|
from ...util import ENV_VARS, check_bool_env_var, get_minor_version
|
||||||
|
from .._util import (
|
||||||
|
download_file,
|
||||||
|
ensure_pathy,
|
||||||
|
get_checksum,
|
||||||
|
get_hash,
|
||||||
|
make_tempdir,
|
||||||
|
upload_file,
|
||||||
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import FluidPath # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
|
|
@ -1,20 +1,39 @@
|
||||||
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
|
||||||
import os.path
|
import os.path
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from wasabi import msg
|
|
||||||
from wasabi.util import locale_escape
|
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
import typer
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
from wasabi.util import locale_escape
|
||||||
|
|
||||||
from ... import about
|
from ... import about
|
||||||
from ...git_info import GIT_VERSION
|
from ...git_info import GIT_VERSION
|
||||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
from ...util import (
|
||||||
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
ENV_VARS,
|
||||||
from ...util import check_bool_env_var, SimpleFrozenDict
|
SimpleFrozenDict,
|
||||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
SimpleFrozenList,
|
||||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
|
check_bool_env_var,
|
||||||
|
is_cwd,
|
||||||
|
is_minor_version_match,
|
||||||
|
join_command,
|
||||||
|
run_command,
|
||||||
|
split_command,
|
||||||
|
working_dir,
|
||||||
|
)
|
||||||
|
from .._util import (
|
||||||
|
COMMAND,
|
||||||
|
PROJECT_FILE,
|
||||||
|
PROJECT_LOCK,
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
get_checksum,
|
||||||
|
get_hash,
|
||||||
|
load_project_config,
|
||||||
|
parse_config_overrides,
|
||||||
|
project_cli,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
@project_cli.command(
|
||||||
|
|
|
@ -1,15 +1,23 @@
|
||||||
from typing import Optional, Dict, Any, Union
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import typer
|
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional, Union
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|
||||||
from ._util import import_code, setup_gpu
|
|
||||||
from ..training.loop import train as train_nlp
|
|
||||||
from ..training.initialize import init_nlp
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..training.initialize import init_nlp
|
||||||
|
from ..training.loop import train as train_nlp
|
||||||
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
app,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
|
|
@ -1,14 +1,21 @@
|
||||||
from typing import Tuple
|
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
import sys
|
||||||
import requests
|
|
||||||
from wasabi import msg, Printer
|
|
||||||
import warnings
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from wasabi import Printer, msg
|
||||||
|
|
||||||
from ._util import app
|
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import get_package_version, get_installed_models, get_minor_version
|
from ..util import (
|
||||||
from ..util import get_package_path, get_model_meta, is_compatible_version
|
get_installed_models,
|
||||||
|
get_minor_version,
|
||||||
|
get_model_meta,
|
||||||
|
get_package_path,
|
||||||
|
get_package_version,
|
||||||
|
is_compatible_version,
|
||||||
|
)
|
||||||
|
from ._util import app
|
||||||
|
|
||||||
|
|
||||||
@app.command("validate")
|
@app.command("validate")
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
"""Helpers for Python and platform compatibility."""
|
"""Helpers for Python and platform compatibility."""
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from thinc.util import copy_array
|
from thinc.util import copy_array
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
|
||||||
DOCS: https://spacy.io/api/top-level#displacy
|
DOCS: https://spacy.io/api/top-level#displacy
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
|
||||||
import warnings
|
import warnings
|
||||||
|
from typing import Any, Callable, Dict, Iterable, Optional, Union
|
||||||
|
|
||||||
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
|
||||||
from ..tokens import Doc, Span
|
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..util import is_in_jupyter
|
from ..tokens import Doc, Span
|
||||||
from ..util import find_available_port
|
from ..util import find_available_port, is_in_jupyter
|
||||||
|
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
||||||
|
|
||||||
_html = {}
|
_html = {}
|
||||||
RENDER_WRAPPER = None
|
RENDER_WRAPPER = None
|
||||||
|
@ -68,7 +66,7 @@ def render(
|
||||||
if jupyter or (jupyter is None and is_in_jupyter()):
|
if jupyter or (jupyter is None and is_in_jupyter()):
|
||||||
# return HTML rendered by IPython display()
|
# return HTML rendered by IPython display()
|
||||||
# See #4840 for details on span wrapper to disable mathjax
|
# See #4840 for details on span wrapper to disable mathjax
|
||||||
from IPython.core.display import display, HTML
|
from IPython.core.display import HTML, display
|
||||||
|
|
||||||
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
|
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
|
||||||
return html
|
return html
|
||||||
|
|
|
@ -1,15 +1,29 @@
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
||||||
import uuid
|
|
||||||
import itertools
|
import itertools
|
||||||
|
import uuid
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..util import escape_html, minify_html, registry
|
from ..util import escape_html, minify_html, registry
|
||||||
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
|
from .templates import (
|
||||||
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
|
TPL_DEP_ARCS,
|
||||||
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
|
TPL_DEP_SVG,
|
||||||
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
|
TPL_DEP_WORDS,
|
||||||
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
|
TPL_DEP_WORDS_LEMMA,
|
||||||
from .templates import TPL_TITLE
|
TPL_ENT,
|
||||||
|
TPL_ENT_RTL,
|
||||||
|
TPL_ENTS,
|
||||||
|
TPL_FIGURE,
|
||||||
|
TPL_KB_LINK,
|
||||||
|
TPL_PAGE,
|
||||||
|
TPL_SPAN,
|
||||||
|
TPL_SPAN_RTL,
|
||||||
|
TPL_SPAN_SLICE,
|
||||||
|
TPL_SPAN_SLICE_RTL,
|
||||||
|
TPL_SPAN_START,
|
||||||
|
TPL_SPAN_START_RTL,
|
||||||
|
TPL_SPANS,
|
||||||
|
TPL_TITLE,
|
||||||
|
)
|
||||||
|
|
||||||
DEFAULT_LANG = "en"
|
DEFAULT_LANG = "en"
|
||||||
DEFAULT_DIR = "ltr"
|
DEFAULT_DIR = "ltr"
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .compat import Literal
|
from .compat import Literal
|
||||||
|
|
||||||
|
|
||||||
|
@ -743,8 +744,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
||||||
"load the model, use its full name instead:\n\n"
|
"load the model, use its full name instead:\n\n"
|
||||||
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
||||||
"models, see the models directory: https://spacy.io/models. If you "
|
"models, see the models directory: https://spacy.io/models and if "
|
||||||
"want to create a blank model, use spacy.blank: "
|
"you want to create a blank model, use spacy.blank: "
|
||||||
"nlp = spacy.blank(\"{name}\")")
|
"nlp = spacy.blank(\"{name}\")")
|
||||||
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
||||||
"return an initialized nlp object but got: {value}. Maybe "
|
"return an initialized nlp object but got: {value}. Maybe "
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .errors import Warnings
|
from .errors import Warnings
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
|
from .candidate import Candidate, get_candidates, get_candidates_batch
|
||||||
from .kb import KnowledgeBase
|
from .kb import KnowledgeBase
|
||||||
from .kb_in_memory import InMemoryLookupKB
|
from .kb_in_memory import InMemoryLookupKB
|
||||||
from .candidate import Candidate, get_candidates, get_candidates_batch
|
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
from .kb cimport KnowledgeBase
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
|
|
||||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
|
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from .kb cimport KnowledgeBase
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
from ..tokens import Span
|
from ..tokens import Span
|
||||||
|
|
||||||
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||||
|
|
|
@ -2,8 +2,10 @@
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from libc.stdint cimport int64_t
|
from libc.stdint cimport int64_t
|
||||||
|
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
cdef class KnowledgeBase:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
|
|
@ -2,12 +2,13 @@
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Tuple, Union
|
from typing import Iterable, Tuple, Union
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .candidate import Candidate
|
from ..errors import Errors
|
||||||
from ..tokens import Span
|
from ..tokens import Span
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList
|
||||||
from ..errors import Errors
|
from .candidate import Candidate
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
cdef class KnowledgeBase:
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
"""Knowledge-base for entity or concept linking."""
|
"""Knowledge-base for entity or concept linking."""
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from libcpp.vector cimport vector
|
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
from libc.stdio cimport FILE
|
from libc.stdio cimport FILE
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
|
from ..structs cimport AliasC, KBEntryC
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
from ..structs cimport KBEntryC, AliasC
|
|
||||||
from .kb cimport KnowledgeBase
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
ctypedef vector[KBEntryC] entry_vec
|
ctypedef vector[KBEntryC] entry_vec
|
||||||
|
|
|
@ -1,23 +1,28 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
from typing import Iterable, Callable, Dict, Any, Union
|
from typing import Any, Callable, Dict, Iterable, Union
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from cpython.exc cimport PyErr_SetFromErrno
|
|
||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
|
||||||
from libc.stdint cimport int32_t, int64_t
|
|
||||||
from libcpp.vector cimport vector
|
|
||||||
|
|
||||||
from pathlib import Path
|
from cpython.exc cimport PyErr_SetFromErrno
|
||||||
|
from libc.stdint cimport int32_t, int64_t
|
||||||
|
from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from ..tokens import Span
|
from ..tokens import Span
|
||||||
|
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
from ..errors import Errors, Warnings
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..errors import Errors, Warnings
|
||||||
from ..util import SimpleFrozenList, ensure_path
|
from ..util import SimpleFrozenList, ensure_path
|
||||||
|
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from .kb cimport KnowledgeBase
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
from .candidate import Candidate as Candidate
|
from .candidate import Candidate as Candidate
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
class AfrikaansDefaults(BaseDefaults):
|
class AfrikaansDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...attrs import LANG
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from ...attrs import LANG
|
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
|
||||||
class AmharicDefaults(BaseDefaults):
|
class AmharicDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,11 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import (
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
ALPHA_UPPER,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
|
||||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import NORM, ORTH
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
class ArabicDefaults(BaseDefaults):
|
class ArabicDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,11 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import (
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
ALPHA_UPPER,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language, BaseDefaults
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class AzerbaijaniDefaults(BaseDefaults):
|
class AzerbaijaniDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
# Eleven, twelve etc. are written separate: on bir, on iki
|
# Eleven, twelve etc. are written separate: on bir, on iki
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..punctuation import (
|
||||||
|
COMBINING_DIACRITICS_TOKENIZER_INFIXES,
|
||||||
|
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
|
||||||
|
)
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
||||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
|
||||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from ...attrs import LANG
|
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
|
||||||
class BulgarianDefaults(BaseDefaults):
|
class BulgarianDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"нула",
|
"нула",
|
||||||
"едно",
|
"едно",
|
||||||
|
|
|
@ -4,8 +4,7 @@ References:
|
||||||
(countries, occupations, fields of studies and more).
|
(countries, occupations, fields of studies and more).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import NORM, ORTH
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class BengaliDefaults(BaseDefaults):
|
class BengaliDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,14 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
from ..char_classes import (
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
|
||||||
_currency = r"\$¢£€¥฿৳"
|
_currency = r"\$¢£€¥฿৳"
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
|
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from .lemmatizer import CatalanLemmatizer
|
from .lemmatizer import CatalanLemmatizer
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class CatalanDefaults(BaseDefaults):
|
class CatalanDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"zero",
|
"zero",
|
||||||
"un",
|
"un",
|
||||||
|
|
|
@ -1,9 +1,18 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
from ..char_classes import (
|
||||||
from ..char_classes import LIST_CURRENCY
|
ALPHA,
|
||||||
from ..char_classes import CURRENCY
|
ALPHA_LOWER,
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
ALPHA_UPPER,
|
||||||
from ..char_classes import merge_chars, _units
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
PUNCT,
|
||||||
|
_units,
|
||||||
|
merge_chars,
|
||||||
|
)
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
from ...tokens import Doc, Span
|
|
||||||
from ...symbols import NOUN, PROPN
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PROPN
|
||||||
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language, BaseDefaults
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class CzechDefaults(BaseDefaults):
|
class CzechDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language, BaseDefaults
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class DanishDefaults(BaseDefaults):
|
class DanishDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
# Source http://fjern-uv.dk/tal.php
|
# Source http://fjern-uv.dk/tal.php
|
||||||
_num_words = """nul
|
_num_words = """nul
|
||||||
en et to tre fire fem seks syv otte ni ti
|
en et to tre fire fem seks syv otte ni ti
|
||||||
|
|
|
@ -1,8 +1,13 @@
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
from ..char_classes import (
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
)
|
||||||
from ..punctuation import TOKENIZER_SUFFIXES
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
from ...tokens import Doc, Span
|
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import AUX, NOUN, PRON, PROPN, VERB
|
||||||
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
|
|
|
@ -2,10 +2,9 @@
|
||||||
Tokenizer Exceptions.
|
Tokenizer Exceptions.
|
||||||
Source: https://forkortelse.dk/ and various others.
|
Source: https://forkortelse.dk/ and various others.
|
||||||
"""
|
"""
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language, BaseDefaults
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class GermanDefaults(BaseDefaults):
|
class GermanDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,9 +1,18 @@
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
from ..char_classes import (
|
||||||
from ..char_classes import CURRENCY, UNITS, PUNCT
|
ALPHA,
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
PUNCT,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
|
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
|
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
class LowerSorbianDefaults(BaseDefaults):
|
class LowerSorbianDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
|
||||||
from .lemmatizer import GreekLemmatizer
|
from .lemmatizer import GreekLemmatizer
|
||||||
from ...language import Language, BaseDefaults
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class GreekDefaults(BaseDefaults):
|
class GreekDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
def get_pos_from_wiktionary():
|
def get_pos_from_wiktionary():
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from gensim.corpora.wikicorpus import extract_pages
|
from gensim.corpora.wikicorpus import extract_pages
|
||||||
|
|
||||||
regex = re.compile(r"==={{(\w+)\|el}}===")
|
regex = re.compile(r"==={{(\w+)\|el}}===")
|
||||||
|
|
|
@ -1,6 +1,16 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
from ..char_classes import (
|
||||||
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
ALPHA,
|
||||||
from ..char_classes import CONCAT_QUOTES, CURRENCY
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
)
|
||||||
|
|
||||||
_units = (
|
_units = (
|
||||||
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
|
||||||
from .lemmatizer import EnglishLemmatizer
|
from .lemmatizer import EnglishLemmatizer
|
||||||
from ...language import Language, BaseDefaults
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(BaseDefaults):
|
class EnglishDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,12 @@
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
from ..char_classes import (
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
)
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
LIST_ELLIPSES
|
LIST_ELLIPSES
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
from ...symbols import NORM, ORTH
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc: Dict[str, List[Dict]] = {}
|
_exc: Dict[str, List[Dict]] = {}
|
||||||
_exclude = [
|
_exclude = [
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .lemmatizer import SpanishLemmatizer
|
from .lemmatizer import SpanishLemmatizer
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from ...language import Language, BaseDefaults
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class SpanishDefaults(BaseDefaults):
|
class SpanishDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from typing import List, Optional, Tuple
|
|
||||||
import re
|
import re
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
from ...tokens import Token
|
from ...tokens import Token
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"cero",
|
"cero",
|
||||||
"uno",
|
"uno",
|
||||||
|
|
|
@ -1,8 +1,17 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
from ..char_classes import (
|
||||||
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
|
ALPHA,
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
ALPHA_LOWER,
|
||||||
from ..char_classes import merge_chars
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
LIST_UNITS,
|
||||||
|
PUNCT,
|
||||||
|
merge_chars,
|
||||||
|
)
|
||||||
|
|
||||||
_list_units = [u for u in LIST_UNITS if u != "%"]
|
_list_units = [u for u in LIST_UNITS if u != "%"]
|
||||||
_units = merge_chars(" ".join(_list_units))
|
_units = merge_chars(" ".join(_list_units))
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
"pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
|
"pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
class EstonianDefaults(BaseDefaults):
|
class EstonianDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from ...language import Language, BaseDefaults
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class BasqueDefaults(BaseDefaults):
|
class BasqueDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
from ..punctuation import TOKENIZER_SUFFIXES
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
_suffixes = TOKENIZER_SUFFIXES
|
_suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
from ...language import BaseDefaults, Language
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class PersianDefaults(BaseDefaults):
|
class PersianDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
MIM = "م"
|
MIM = "م"
|
||||||
ZWNJ_O_MIM = "ام"
|
ZWNJ_O_MIM = "ام"
|
||||||
YE_NUN = "ین"
|
YE_NUN = "ین"
|
||||||
|
|
|
@ -1,5 +1,11 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import (
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
ALPHA_UPPER,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
from ...tokens import Doc, Span
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user