Mirror of https://github.com/explosion/spaCy.git
Synced 2025-11-04 01:48:04 +03:00

Merge branch 'ci/add-cython-linter' of github.com:bdura/spaCy into ci/add-cython-linter
Commit 6c483971b7

.github/workflows/tests.yml (4 changes, vendored)
@@ -37,6 +37,10 @@ jobs:
         run: |
           python -m pip install black -c requirements.txt
           python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
       - name: flake8
         run: |
           python -m pip install flake8==5.0.4

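The new "isort" step mirrors the Black step directly above it: install the pinned tool against the repository's constraints file, then run it in check-only mode. The same two commands from the step can be run locally from the repository root to reproduce the CI check:

    python -m pip install isort -c requirements.txt
    python -m isort spacy --check
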
pyproject.toml
@@ -9,3 +9,6 @@ requires = [
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"

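The added [tool.isort] section only selects isort's built-in "black" profile. As a hedged reference, the values below are what isort's documentation describes that profile as roughly equivalent to; they are not part of this diff:

    [tool.isort]
    multi_line_output = 3
    include_trailing_comma = true
    force_grid_wrap = 0
    use_parentheses = true
    ensure_newline_before_comments = true
    line_length = 88

This is why the rewritten imports in the hunks below use parenthesized, one-name-per-line blocks with trailing commas.
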
requirements.txt
@@ -39,3 +39,4 @@ types-requests
 types-setuptools>=57.0.0
 black==22.3.0
 cython-lint>=0.15.0
+isort>=5.0,<6.0

spacy/__init__.py
@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union

 # set library-specific custom warning handling before doing anything else
 from .errors import setup_default_warnings
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
 setup_default_warnings()  # noqa: E402

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401

 from . import pipeline  # noqa: F401
-from .cli.info import info  # noqa: F401
-from .glossary import explain  # noqa: F401
-from .about import __version__  # noqa: F401
-from .util import registry, logger  # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
 from . import util
+from .about import __version__  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .errors import Errors
+from .glossary import explain  # noqa: F401
+from .language import Language
+from .util import logger, registry  # noqa: F401
+from .vocab import Vocab

 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)

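Most of the remaining hunks are this same mechanical change: isort regroups imports into standard-library, third-party, and first-party blocks and sorts each block alphabetically. A minimal sketch of reproducing one such rewrite with isort's Python API (assumes isort >= 5 is installed; the snippet itself is illustrative and not part of the diff):

    import isort

    # The three lines removed at the top of spacy/__init__.py above.
    messy = (
        "from typing import Union, Iterable, Dict, Any\n"
        "from pathlib import Path\n"
        "import sys\n"
    )
    # profile="black" matches the [tool.isort] setting added in pyproject.toml.
    print(isort.code(messy, profile="black"))
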
spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.0.dev0"
+__version__ = "3.6.0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

spacy/attrs.pxd
@@ -1,6 +1,7 @@
 # Reserve 64 values for flag features
 from . cimport symbols

+
 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA

spacy/cli/__init__.py
@@ -1,35 +1,35 @@
 from wasabi import msg

 from ._util import app, setup_cli  # noqa: F401
+from .apply import apply  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401

 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
 from .benchmark_speed import benchmark_speed_cli  # noqa: F401
-from .download import download  # noqa: F401
-from .info import info  # noqa: F401
-from .package import package  # noqa: F401
-from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .assemble import assemble_cli  # noqa: F401
-from .pretrain import pretrain  # noqa: F401
-from .debug_data import debug_data  # noqa: F401
-from .debug_config import debug_config  # noqa: F401
-from .debug_model import debug_model  # noqa: F401
-from .debug_diff import debug_diff  # noqa: F401
-from .evaluate import evaluate  # noqa: F401
-from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_pipeline import init_pipeline_cli  # noqa: F401
-from .init_config import init_config, fill_config  # noqa: F401
-from .validate import validate  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.run import project_run  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.document import project_document  # noqa: F401
+from .debug_config import debug_config  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
+from .download import download  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
+from .info import info  # noqa: F401
+from .init_config import fill_config, init_config  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .package import package  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .profile import profile  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

spacy/cli/_util.py
@@ -1,26 +1,44 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
+import srsly
 import typer
 from click import NoSuchOption
 from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
+from typer.main import get_command
+from wasabi import Printer, msg

+from .. import about
 from ..compat import Literal
 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from .. import about
+from ..util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    import_file,
+    is_compatible_version,
+    logger,
+    make_tempdir,
+    registry,
+    run_command,
+)

 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401

spacy/cli/apply.py
@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
 from itertools import chain
 from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast
+
+import srsly
+import tqdm
 from wasabi import msg

-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
 from ..tokens import Doc, DocBin
-from ..vocab import Vocab
 from ..util import ensure_path, load_model
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory

 path_help = """Location of the documents to predict on.
 Can be a single file in .spacy format or a .jsonl file.

spacy/cli/assemble.py
@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
 from .. import util
 from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)


 @app.command(

spacy/cli/benchmark_speed.py
@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
 import random
-from itertools import islice
-import numpy
-from pathlib import Path
 import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
 import typer
+from tqdm import tqdm
 from wasabi import msg

 from .. import util

spacy/cli/convert.py
@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
 import re
 import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer

-from ._util import app, Arg, Opt, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
+from ..training import docs_to_json
+from ..training.converters import (
+    conll_ner_to_docs,
+    conllu_to_docs,
+    iob_to_docs,
+    json_to_docs,
+)
+from ._util import Arg, Opt, app, walk_directory

 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new

spacy/cli/debug_config.py
@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table

-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from .. import util
 from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)


 @debug_cli.command(

spacy/cli/debug_data.py
@@ -1,31 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
-import numpy
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)

-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..compat import Literal
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
 from ..util import registry, resolve_dot_names
-from ..compat import Literal
 from ..vectors import Mode as VectorsMode
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    _format_number,
+    app,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)

 # Minimum number of expected occurrences of NER label in data to train new label
 NEW_LABEL_THRESHOLD = 50
@@ -212,7 +230,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the package")

-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         model_labels_spancat = _get_labels_from_spancat(nlp)
         has_low_data_warning = False
         has_no_neg_warning = False
@@ -830,7 +848,7 @@ def _compile_gold(
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "spancat" in factory_names:
+        if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
             for spans_key in list(eg.reference.spans.keys()):
                 # Obtain the span frequency
                 if spans_key not in data["spancat"]:
@@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
     ]
     labels: Dict[str, Set[str]] = {}
     for pipe_name in pipe_names:

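Beyond the import sorting, these hunks broaden the existing `spancat` handling in `debug data` so that pipelines using the single-label `spancat_singlelabel` factory get the same checks. A small illustrative sketch of the broadened factory test (assumes a spaCy version that registers both factories; the snippet is not part of the diff):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("spancat")  # this pipe's factory name is "spancat"

    factory_names = [nlp.get_pipe_meta(name).factory for name in nlp.pipe_names]
    # The updated check accepts either span categorizer factory.
    print(any(f in ("spancat", "spancat_singlelabel") for f in factory_names))  # True
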
spacy/cli/debug_diff.py
@@ -1,13 +1,13 @@
+from pathlib import Path
 from typing import Optional

 import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
 from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings

-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
 from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config


 @debug_cli.command(

spacy/cli/debug_model.py
@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
 import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+    Model,
+    data_validation,
+    fix_random_seed,
+    set_dropout_rate,
+    set_gpu_allocator,
+)
+from wasabi import msg

 from spacy.training import Example
 from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer

-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
 from ..schemas import ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+    string_to_list,
+)


 @debug_cli.command(

spacy/cli/download.py
@@ -1,14 +1,14 @@
-from typing import Optional, Sequence
-import requests
 import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
 from ..errors import OLD_MODEL_SHORTCUTS
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app


 @app.command(

spacy/cli/evaluate.py
@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
 import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import srsly
 from thinc.api import fix_random_seed
+from wasabi import Printer

-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
+from .. import displacy, util
 from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu


 @benchmark_cli.command(

spacy/cli/find_threshold.py
@@ -1,17 +1,17 @@
 import functools
+import logging
 import operator
 from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple

 import numpy
 import wasabi.tables

-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu

 _DEFAULTS = {
     "n_trials": 11,

spacy/cli/info.py
@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
 import json
+import platform
 from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
+from typing import Any, Dict, List, Optional, Union
+
+import srsly
+from wasabi import MarkdownRenderer, Printer

-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+from .. import about, util
 from ..compat import importlib_metadata
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename


 @app.command("info")

spacy/cli/init_config.py
@@ -1,19 +1,26 @@
-from typing import Optional, List, Tuple
+import re
 from enum import Enum
 from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
 import srsly
-import re
 from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings

 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)

 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"

spacy/cli/init_pipeline.py
@@ -1,15 +1,23 @@
-from typing import Optional
 import logging
 from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
 import srsly
+import typer
+from wasabi import msg

 from .. import util
-from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


 @init_cli.command("vectors")
@@ -24,6 +32,7 @@ def init_vectors_cli(
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that
@@ -42,6 +51,7 @@ def init_vectors_cli(
         prune=prune,
         name=name,
         mode=mode,
+        attr=attr,
     )
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)

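Besides the import sorting, these hunks add an `--attr` option to `spacy init vectors`, which is forwarded to the converter as `attr=attr`. A hedged usage sketch (the paths are placeholders; the positional arguments follow the existing command's language, vectors file, output directory order):

    python -m spacy init vectors en ./vectors.txt ./output_dir --attr LOWER
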
spacy/cli/package.py
@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
 import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
+
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input

-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list


 @app.command("package")

spacy/cli/pretrain.py
@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


 @app.command(

spacy/cli/profile.py
@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
 import cProfile
+import itertools
 import pstats
 import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg

-from ._util import app, debug_cli, Arg, Opt, NAME
 from ..language import Language
 from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli


 @debug_cli.command("profile")

spacy/cli/project/assets.py
@@ -1,16 +1,27 @@
-from typing import Any, Dict, Optional
-from pathlib import Path
-from wasabi import msg
 import os
 import re
 import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
+
 import requests
 import typer
+from wasabi import msg

 from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-from .._util import get_checksum, download_file, git_checkout, get_git_version
-from .._util import SimpleFrozenDict, parse_config_overrides
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)

 # Whether assets are extra if `extra` is not set.
 EXTRA_DEFAULT = False

spacy/cli/project/clone.py
@@ -1,13 +1,22 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import subprocess
 import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg

 from ... import about
 from ...util import ensure_path
-from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_checkout, get_git_version, git_repo_branch_exists
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)

 DEFAULT_REPO = about.__projects__
 DEFAULT_PROJECTS_BRANCH = about.__projects_branch__

spacy/cli/project/document.py
@@ -1,9 +1,9 @@
 from pathlib import Path
-from wasabi import msg, MarkdownRenderer
+
+from wasabi import MarkdownRenderer, msg

 from ...util import working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli

 DOCS_URL = "https://spacy.io"
 INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the

spacy/cli/project/dvc.py
@@ -1,15 +1,28 @@
 """This module contains helpers and subcommands for integrating spaCy projects
 with Data Version Controk (DVC). https://dvc.org"""
-from typing import Dict, Any, List, Optional, Iterable
 import subprocess
 from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
 from wasabi import msg

-from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
-from .._util import Arg, Opt, NAME, COMMAND
-from ...util import working_dir, split_command, join_command, run_command
-from ...util import SimpleFrozenList
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)

 DVC_CONFIG = "dvc.yaml"
 DVC_DIR = ".dvc"

spacy/cli/project/pull.py
@@ -1,9 +1,9 @@
 from pathlib import Path
+
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_command_hash
-from .._util import project_cli, Arg, logger
-from .._util import load_project_config
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
 from .run import update_lockfile



spacy/cli/project/push.py
@@ -1,9 +1,9 @@
 from pathlib import Path
+
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_content_hash, get_command_hash
-from .._util import load_project_config
-from .._util import project_cli, Arg, logger
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash


 @project_cli.command("push")

					@ -1,18 +1,25 @@
 | 
				
			||||||
from typing import Optional, List, Dict, TYPE_CHECKING
 | 
					import hashlib
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import site
 | 
					import site
 | 
				
			||||||
import hashlib
 | 
					 | 
				
			||||||
import urllib.parse
 | 
					 | 
				
			||||||
import tarfile
 | 
					import tarfile
 | 
				
			||||||
 | 
					import urllib.parse
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					from typing import TYPE_CHECKING, Dict, List, Optional
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .._util import get_hash, get_checksum, upload_file, download_file
 | 
					 | 
				
			||||||
from .._util import ensure_pathy, make_tempdir
 | 
					 | 
				
			||||||
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
 | 
					 | 
				
			||||||
from ...git_info import GIT_VERSION
 | 
					 | 
				
			||||||
from ... import about
 | 
					from ... import about
 | 
				
			||||||
from ...errors import Errors
 | 
					from ...errors import Errors
 | 
				
			||||||
 | 
					from ...git_info import GIT_VERSION
 | 
				
			||||||
 | 
					from ...util import ENV_VARS, check_bool_env_var, get_minor_version
 | 
				
			||||||
 | 
					from .._util import (
 | 
				
			||||||
 | 
					    download_file,
 | 
				
			||||||
 | 
					    ensure_pathy,
 | 
				
			||||||
 | 
					    get_checksum,
 | 
				
			||||||
 | 
					    get_hash,
 | 
				
			||||||
 | 
					    make_tempdir,
 | 
				
			||||||
 | 
					    upload_file,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if TYPE_CHECKING:
 | 
					if TYPE_CHECKING:
 | 
				
			||||||
    from pathy import FluidPath  # noqa: F401
 | 
					    from pathy import FluidPath  # noqa: F401
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
@@ -1,20 +1,39 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
-from pathlib import Path
-
-from wasabi import msg
-from wasabi.util import locale_escape
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
 import srsly
 import typer
+from wasabi import msg
+from wasabi.util import locale_escape
 
 from ... import about
 from ...git_info import GIT_VERSION
-from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var, SimpleFrozenDict
-from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
 
 
 @project_cli.command(
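All of the import rewrites in the hunks above follow the same pattern: imports are regrouped into standard-library, third-party and local blocks, duplicate `from` imports are merged, and long import lists are wrapped into parenthesized, one-name-per-line groups. As a rough illustration only (the sample source below is made up and is not part of this commit), the same regrouping can be reproduced with isort's Python API, assuming isort >= 5 with its black-compatible profile:

    # Sketch: reproduce the import regrouping shown in the hunks above.
    # The sample source is hypothetical; isort.code() returns the rewritten text.
    import isort

    messy = (
        "from typing import Optional, List\n"
        "import os\n"
        "from wasabi import msg\n"
    )
    print(isort.code(messy, profile="black"))
    # Expected shape of the output: stdlib imports first, then a blank line,
    # then third-party imports, with each block sorted alphabetically.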
@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@@ -28,7 +28,7 @@ lang = "{{ lang }}"
 tok2vec/transformer. #}
 {%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
 {%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -127,6 +127,30 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
 {% if "spancat" in components -%}
 [components.spancat]
 factory = "spancat"
@@ -392,6 +416,27 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
 
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "spancat" in components %}
 [components.spancat]
 factory = "spancat"
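Both template blocks above emit the same span_finder settings (spans_key, threshold, min_length/max_length); only the tok2vec listener differs between the transformer and CPU branches. As a hedged sketch of what the generated component amounts to at runtime, assuming a spaCy build that ships the "span_finder" factory, the equivalent programmatic setup looks roughly like this:

    # Sketch only: add a span_finder with the settings used in the template.
    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe(
        "span_finder",
        config={
            "spans_key": "sc",   # predicted candidate spans land in doc.spans["sc"]
            "threshold": 0.5,    # probability cutoff for span boundaries
            "min_length": None,  # no lower bound on span length
            "max_length": None,  # no upper bound on span length
        },
    )
    # The component still needs training data before it predicts anything useful.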
@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
 import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
 from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @app.command(
@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
 import sys
-import requests
-from wasabi import msg, Printer
 import warnings
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg
 
-from ._util import app
 from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_model_meta,
+    get_package_path,
+    get_package_version,
+    is_compatible_version,
+)
+from ._util import app
 
 
 @app.command("validate")
@@ -1,5 +1,6 @@
 """Helpers for Python and platform compatibility."""
 import sys
+
 from thinc.util import copy_array
 
 try:
@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from typing import Union, Iterable, Optional, Dict, Any, Callable
 import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union
 
-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
 from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer
 
 _html = {}
 RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
     if jupyter or (jupyter is None and is_in_jupyter()):
         # return HTML rendered by IPython display()
         # See #4840 for details on span wrapper to disable mathjax
-        from IPython.core.display import display, HTML
+        from IPython.core.display import HTML, display
 
         return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
     return html
@@ -1,15 +1,29 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-import uuid
 import itertools
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
 from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+    TPL_DEP_ARCS,
+    TPL_DEP_SVG,
+    TPL_DEP_WORDS,
+    TPL_DEP_WORDS_LEMMA,
+    TPL_ENT,
+    TPL_ENT_RTL,
+    TPL_ENTS,
+    TPL_FIGURE,
+    TPL_KB_LINK,
+    TPL_PAGE,
+    TPL_SPAN,
+    TPL_SPAN_RTL,
+    TPL_SPAN_SLICE,
+    TPL_SPAN_SLICE_RTL,
+    TPL_SPAN_START,
+    TPL_SPAN_START_RTL,
+    TPL_SPANS,
+    TPL_TITLE,
+)
 
 DEFAULT_LANG = "en"
 DEFAULT_DIR = "ltr"
@@ -1,4 +1,5 @@
 import warnings
+
 from .compat import Literal
 
 
@@ -215,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+    W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+            "key attribute for vectors, configure it through Vectors(attr=) or "
+            "'spacy init vectors --attr'")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -738,8 +742,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "model from a shortcut, which is obsolete as of spaCy v3.0. To "
             "load the model, use its full name instead:\n\n"
             "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
-            "models, see the models directory: https://spacy.io/models. If you "
-            "want to create a blank model, use spacy.blank: "
+            "models, see the models directory: https://spacy.io/models and if "
+            "you want to create a blank model, use spacy.blank: "
             "nlp = spacy.blank(\"{name}\")")
     E942 = ("Executing `after_{name}` callback failed. Expected the function to "
             "return an initialized nlp object but got: {value}. Maybe "
@@ -970,6 +974,13 @@ class Errors(metaclass=ErrorsWithCodes):
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
             "or use `auto_select_port=True` to pick an available port automatically.")
     E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+    E1052 = ("Unable to copy spans: the character offsets for the span at "
+             "index {i} in the span group do not align with the tokenization "
+             "in the target doc.")
+    E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+             " 'min_length': {min_length}, 'max_length': {max_length}")
+    E1054 = ("The text, including whitespace, must match between reference and "
+             "predicted docs when training {component}.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
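The new W125 and E1052-E1054 entries are ordinary format-string templates, used the same way as the existing codes: the caller fills in the named fields and warns or raises with the result. A minimal illustration (values invented for the example):

    # Illustration only: expanding one of the new message templates.
    from spacy.errors import Errors

    min_length, max_length = 0, 25
    if min_length is not None and min_length < 1:
        raise ValueError(
            Errors.E1053.format(min_length=min_length, max_length=max_length)
        )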
@@ -1,4 +1,5 @@
 import warnings
+
 from .errors import Warnings
 
 
@@ -1,3 +1,3 @@
+from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, get_candidates, get_candidates_batch
@@ -1,6 +1,8 @@
-from .kb cimport KnowledgeBase
 from libcpp.vector cimport vector
+
 from ..typedefs cimport hash_t
+from .kb cimport KnowledgeBase
+
 
 # Object used by the Entity Linker that summarizes one entity-alias candidate combination.
 cdef class Candidate:
@@ -1,9 +1,12 @@
 # cython: infer_types=True, profile=True
+
 from typing import Iterable
+
 from .kb cimport KnowledgeBase
+
 from ..tokens import Span
 
 
 cdef class Candidate:
     """A `Candidate` object refers to a textual mention (`alias`) that may or
     may not be resolved to a specific `entity` from a Knowledge Base. This
@@ -2,8 +2,10 @@
 
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t
+
 from ..vocab cimport Vocab
 
+
 cdef class KnowledgeBase:
     cdef Pool mem
     cdef readonly Vocab vocab
@@ -2,12 +2,13 @@
 
 from pathlib import Path
 from typing import Iterable, Tuple, Union
 
 from cymem.cymem cimport Pool
-from .candidate import Candidate
+
+from ..errors import Errors
 from ..tokens import Span
 from ..util import SimpleFrozenList
-from ..errors import Errors
+from .candidate import Candidate
 
 
 cdef class KnowledgeBase:
@@ -1,11 +1,11 @@
 """Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
 
+from ..structs cimport AliasC, KBEntryC
 from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
 from .kb cimport KnowledgeBase
 
 ctypedef vector[KBEntryC] entry_vec
@@ -1,23 +1,28 @@
 # cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union
+from typing import Any, Callable, Dict, Iterable, Union
+
 import srsly
-from preshed.maps cimport PreshMap
-from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
-from libc.stdint cimport int32_t, int64_t
-from libcpp.vector cimport vector
-from pathlib import Path
+
+from cpython.exc cimport PyErr_SetFromErrno
+from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+
 import warnings
+from pathlib import Path
+
 from ..tokens import Span
+
 from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
+
 from .. import util
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, ensure_path
+
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
+
 from .candidate import Candidate as Candidate
 
 
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
 
 
 class AfrikaansDefaults(BaseDefaults):
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
 
 
 class AmharicDefaults(BaseDefaults):
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)
 
 _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
 
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
 
 _exc = {}
 
@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
 
 
 class ArabicDefaults(BaseDefaults):
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)
 
 _suffixes = (
     LIST_PUNCT
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {}
 
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
 
 
 class AzerbaijaniDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 # Eleven, twelve etc. are written separate: on bir, on iki
 
 _num_words = [
@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+    COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
 
 
 class BulgarianDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 _num_words = [
     "нула",
     "едно",
@@ -4,8 +4,7 @@ References:
     (countries, occupations, fields of studies and more).
 """
-from ...symbols import ORTH, NORM
-
+
+from ...symbols import NORM, ORTH
 
 _exc = {}
 
@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
 from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class BengaliDefaults(BaseDefaults):
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)
 
 _currency = r"\$¢£€¥฿৳"
 _quotes = CONCAT_QUOTES.replace("'", "")
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {}
 
@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class CatalanDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 _num_words = [
     "zero",
     "un",
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    _units,
+    merge_chars,
+)
 
 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
 
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span
 
 
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {}
 
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
 
 
 class CzechDefaults(BaseDefaults):
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class DanishDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 # Source http://fjern-uv.dk/tal.php
 _num_words = """nul
 en et to tre fire fem seks syv otte ni ti
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 from ..punctuation import TOKENIZER_SUFFIXES
 
-
 _quotes = CONCAT_QUOTES.replace("'", "")
 
 _infixes = (
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
 
 
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
@@ -2,10 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {}
 
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class GermanDefaults(BaseDefaults):
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
-
 _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
 
 _suffixes = (
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
 
-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span
 
 
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {
     "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
 
 
 class LowerSorbianDefaults(BaseDefaults):
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class GreekDefaults(BaseDefaults):
@@ -1,5 +1,6 @@
 def get_pos_from_wiktionary():
     import re
+
     from gensim.corpora.wikicorpus import extract_pages
 
     regex = re.compile(r"==={{(\w+)\|el}}===")
| 
						 | 
@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    HYPHENS,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+)
 
 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
 
-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span
 
 
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {}
 
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
 
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class EnglishDefaults(BaseDefaults):
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 
 _infixes = (
     LIST_ELLIPSES
 | 
				
			||||||
from typing import Union, Iterator, Tuple
 | 
					from typing import Iterator, Tuple, Union
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ...symbols import NOUN, PROPN, PRON
 | 
					 | 
				
			||||||
from ...errors import Errors
 | 
					from ...errors import Errors
 | 
				
			||||||
 | 
					from ...symbols import NOUN, PRON, PROPN
 | 
				
			||||||
from ...tokens import Doc, Span
 | 
					from ...tokens import Doc, Span
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
@@ -1,8 +1,8 @@
 from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
-
 
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+
 _exc: Dict[str, List[Dict]] = {}
 _exclude = [
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class SpanishDefaults(BaseDefaults):
@@ -1,5 +1,5 @@
-from typing import List, Optional, Tuple
 import re
+from typing import List, Optional, Tuple
 
 from ...pipeline import Lemmatizer
 from ...tokens import Token
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 _num_words = [
     "cero",
     "uno",
@@ -1,8 +1,17 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
-from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import merge_chars
-
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    LIST_UNITS,
+    PUNCT,
+    merge_chars,
+)
 
 _list_units = [u for u in LIST_UNITS if u != "%"]
 _units = merge_chars(" ".join(_list_units))
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
 
-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span
 
 
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {
     "pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
 
 
 class EstonianDefaults(BaseDefaults):
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
 
 
 class BasqueDefaults(BaseDefaults):
@@ -1,4 +1,3 @@
 from ..punctuation import TOKENIZER_SUFFIXES
 
-
 _suffixes = TOKENIZER_SUFFIXES
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_SUFFIXES
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
 from ...pipeline import Lemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class PersianDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 MIM = "م"
 ZWNJ_O_MIM = "ام"
 YE_NUN = "ین"
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)
 
 _suffixes = (
     LIST_PUNCT
Some files were not shown because too many files have changed in this diff.
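Every hunk shown above follows the same pattern: isort sorts the names inside each import statement, orders the statements within their section (standard library, then third party, then local imports), merges repeated imports from the same module, and wraps import lists that exceed the line length into parenthesized blocks. As a rough illustration only, and not part of this commit, the same reordering can be reproduced with isort's Python API; the snippet below assumes isort >= 5 is installed and uses the black profile for black-compatible output.

    # Minimal sketch: feed isort a string of imports and print the sorted result.
    import isort

    messy = (
        "from typing import Union, Iterator, Tuple\n"
        "from ...symbols import NOUN, PROPN, PRON\n"
        "from ...errors import Errors\n"
    )

    # isort.code() returns the reordered source as a string; profile="black"
    # keeps the layout compatible with black's formatting.
    print(isort.code(messy, profile="black"))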