Merge branch 'develop' into feature/doc-ents-v3-2

commit 535842e483
@@ -224,7 +224,7 @@ for that particular code. Here's an example:

```python
# fmt: off
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
-heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
+heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11]
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
        "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
        "poss", "nsubj", "ccomp", "punct"]
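The change above reflects the new `Doc` convention on this branch: `heads` are absolute token indices rather than offsets relative to each token. As a rough sketch of how the two encodings relate (the helper name is hypothetical, for illustration only):

```python
# Hypothetical helper: convert old relative head offsets to the new
# absolute token indices used throughout this branch.
def offsets_to_indices(relative_heads):
    return [i + offset for i, offset in enumerate(relative_heads)]

old_heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
assert offsets_to_indices(old_heads) == [
    1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11
]
```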
@@ -421,7 +421,7 @@ Tests that require the model to be loaded should be marked with

`@pytest.mark.models`. Loading the models is expensive and not necessary if
you're not actually testing the model performance. If all you need is a `Doc`
object with annotations like heads, POS tags or the dependency parse, you can
-use the `get_doc()` utility function to construct it manually.
+use the `Doc` constructor to construct it manually.

📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a19"
+__version__ = "3.0.0a21"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -7,13 +7,15 @@ import srsly
import hashlib
import typer
from click import NoSuchOption
+from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.config import Config, ConfigValidationError
from configparser import InterpolationError
+import os

from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry
+from ..util import import_file, run_command, make_tempdir, registry, logger

if TYPE_CHECKING:
    from pathy import Pathy  # noqa: F401

@@ -37,6 +39,7 @@ commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
+OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"

# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
@@ -61,24 +64,41 @@ def setup_cli() -> None:
    command(prog_name=COMMAND)


-def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
+def parse_config_overrides(
+    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
+) -> Dict[str, Any]:
    """Generate a dictionary of config overrides based on the extra arguments
    provided on the CLI, e.g. --training.batch_size to override
    "training.batch_size". Arguments without a "." are considered invalid,
    since the config only allows top-level sections to exist.

    args (List[str]): The extra arguments from the command line.
+    env_var (Optional[str]): Optional environment variable to read from.
    RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
    """
+    env_string = os.environ.get(env_var, "") if env_var else ""
+    env_overrides = _parse_overrides(split_arg_string(env_string))
+    cli_overrides = _parse_overrides(args, is_cli=True)
+    if cli_overrides:
+        keys = [k for k in cli_overrides if k not in env_overrides]
+        logger.debug(f"Config overrides from CLI: {keys}")
+    if env_overrides:
+        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+    return {**cli_overrides, **env_overrides}


+def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
    result = {}
    while args:
        opt = args.pop(0)
-        err = f"Invalid CLI argument '{opt}'"
+        err = f"Invalid config override '{opt}'"
        if opt.startswith("--"):  # new argument
            orig_opt = opt
            opt = opt.replace("--", "")
            if "." not in opt:
-                raise NoSuchOption(orig_opt)
+                if is_cli:
+                    raise NoSuchOption(orig_opt)
+                else:
+                    msg.fail(f"{err}: can't override top-level sections", exits=1)
            if "=" in opt:  # we have --opt=value
                opt, value = opt.split("=", 1)
                opt = opt.replace("-", "_")
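For reference, a rough sketch of how the merged overrides behave after this change, assuming the function shown above (values are illustrative; `_parse_overrides` also converts values from JSON where possible):

```python
import os

# Overrides can now come from the environment as well as the CLI.
os.environ["SPACY_CONFIG_OVERRIDES"] = "--training.seed 42"
overrides = parse_config_overrides(["--training.batch_size", "128"])
# Environment overrides are unpacked last, so they win on conflicting keys:
# {"training.batch_size": 128, "training.seed": 42}
```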
@@ -97,7 +117,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
            except ValueError:
                result[opt] = str(value)
        else:
-            msg.fail(f"{err}: override option should start with --", exits=1)
+            msg.fail(f"{err}: name should start with --", exits=1)
    return result

@@ -286,7 +306,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
    if dest.exists() and not force:
        return None
    src = str(src)
-    with smart_open.open(src, mode="rb") as input_file:
+    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
        with dest.open(mode="wb") as output_file:
            output_file.write(input_file.read())
@@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
-from thinc.config import VARIABLE_RE
+from thinc.config import VARIABLE_RE, ConfigValidationError
import typer

from ._util import Arg, Opt, show_validation_error, parse_config_overrides

@@ -51,7 +51,10 @@ def debug_config(
    msg.divider("Config validation")
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
-        nlp, _ = util.load_model_from_config(config)
+        nlp, resolved = util.load_model_from_config(config)
+        # Use the resolved config here in case user has one function returning
+        # a dict of corpora etc.
+        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)
@@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
        value = util.dot_to_object(config, path)
        result[variable] = repr(value)
    return result
+
+
+def check_section_refs(config: Config, fields: List[str]) -> None:
+    """Validate fields in the config that refer to other sections or values
+    (e.g. in the corpora) and make sure that those references exist.
+    """
+    errors = []
+    for field in fields:
+        # If the field doesn't exist in the config, we ignore it
+        try:
+            value = util.dot_to_object(config, field)
+        except KeyError:
+            continue
+        try:
+            util.dot_to_object(config, value)
+        except KeyError:
+            msg = f"not a valid section reference: {value}"
+            errors.append({"loc": field.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config, errors)
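For context, `check_section_refs` resolves dotted values like `training.train_corpus` against the config itself. A minimal sketch (the config contents are illustrative, not from this diff):

```python
from thinc.api import Config

cfg = Config().from_str("""
[corpora]

[corpora.train]
path = "corpus/train.spacy"

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
""")
# "corpora.train" resolves, but there is no [corpora.dev] section, so
# check_section_refs(cfg, ["training.train_corpus", "training.dev_corpus"])
# would raise a ConfigValidationError: "not a valid section reference".
```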
@@ -1,5 +1,9 @@
-from typing import Dict, Any, Optional
+import warnings
+from typing import Dict, Any, Optional, Iterable
from pathlib import Path

+from spacy.training import Example
+from spacy.util import dot_to_object
from wasabi import msg
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation, set_gpu_allocator

@@ -59,23 +63,24 @@ def debug_model_cli(
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
-    nlp, config = util.load_model_from_config(config_path)
+    nlp, config = util.load_model_from_config(config)
    seed = config["training"]["seed"]
    if seed is not None:
        msg.info(f"Fixing random seed: {seed}")
        fix_random_seed(seed)
    pipe = nlp.get_pipe(component)
-    if hasattr(pipe, "model"):
-        model = pipe.model
-    else:
+    if not hasattr(pipe, "model"):
        msg.fail(
            f"The component '{component}' does not specify an object that holds a Model.",
            exits=1,
        )
-    debug_model(model, print_settings=print_settings)
+    model = pipe.model
+    debug_model(config, nlp, model, print_settings=print_settings)


-def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(
+    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+):
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -92,10 +97,23 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None

    # STEP 1: Initializing the model and printing again
    X = _get_docs()
-    Y = _get_output(model.ops.xp)
-    # The output vector might differ from the official type of the output layer
-    with data_validation(False):
-        model.initialize(X=X, Y=Y)
+    try:
+        train_corpus = dot_to_object(config, config["training"]["train_corpus"])
+        nlp.begin_training(lambda: train_corpus(nlp))
+        msg.info("Initialized the model with the training corpus.")
+    except ValueError:
+        try:
+            _set_output_dim(nO=7, model=model)
+            nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+            msg.info("Initialized the model with dummy data.")
+        except:
+            msg.fail(
+                "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+                exits=1,
+            )

    if print_settings.get("print_after_init"):
        msg.divider(f"STEP 1 - after initialization")
        _print_model(model, print_settings)
@@ -103,9 +121,18 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
+    # ugly hack to deal with Tok2Vec listeners
+    tok2vec = None
+    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
+        tok2vec = nlp.get_pipe("tok2vec")
+    goldY = None
    for e in range(3):
-        Y, get_dX = model.begin_update(_get_docs())
-        dY = get_gradient(model, Y)
+        if tok2vec:
+            tok2vec.predict(X)
+        Y, get_dX = model.begin_update(X)
+        if goldY is None:
+            goldY = _simulate_gold(Y)
+        dY = get_gradient(goldY, Y, model.ops)
        get_dX(dY)
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
@@ -113,15 +140,25 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
        _print_model(model, print_settings)

    # STEP 3: the final prediction
-    prediction = model.predict(_get_docs())
+    prediction = model.predict(X)
    if print_settings.get("print_prediction"):
        msg.divider(f"STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good(f"Successfully ended analysis - model looks good.")


-def get_gradient(model, Y):
-    goldY = _get_output(model.ops.xp)
-    return Y - goldY
+def get_gradient(goldY, Y, ops):
+    return ops.asarray(Y) - ops.asarray(goldY)
+
+
+def _simulate_gold(element, counter=1):
+    if isinstance(element, Iterable):
+        for i in range(len(element)):
+            element[i] = _simulate_gold(element[i], counter + i)
+        return element
+    else:
+        return 1 / counter


def _sentences():
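To make the dummy-gold logic concrete, here's a quick trace of `_simulate_gold` on a small nested list (illustrative input, assuming the implementation shown in the hunk above):

```python
from typing import Iterable

def _simulate_gold(element, counter=1):
    # as in the diff above: every scalar becomes 1 / counter,
    # where the counter grows with the element's position
    if isinstance(element, Iterable):
        for i in range(len(element)):
            element[i] = _simulate_gold(element[i], counter + i)
        return element
    return 1 / counter

print(_simulate_gold([[0.5, 0.2, 0.1], [0.9, 0.4, 0.7]]))
# [[1.0, 0.5, 0.333...], [0.5, 0.333..., 0.25]]
```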
@@ -138,8 +175,13 @@ def _get_docs(lang: str = "en"):
    return list(nlp.pipe(_sentences()))


-def _get_output(xp):
-    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+def _set_output_dim(model, nO):
+    # simulating dim inference by directly setting the nO argument of the model
+    if model.has_dim("nO") is None:
+        model.set_dim("nO", nO)
+    if model.has_ref("output_layer"):
+        if model.get_ref("output_layer").has_dim("nO") is None:
+            model.get_ref("output_layer").set_dim("nO", nO)


def _print_model(model, print_settings):
@@ -66,6 +66,7 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
+            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:

@@ -59,7 +59,7 @@ def project_run(
    for dep in cmd.get("deps", []):
        if not (project_dir / dep).exists():
            err = f"Missing dependency specified by command '{subcommand}': {dep}"
-            err_help = "Maybe you forgot to run the 'project assets' command?"
+            err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
            err_kwargs = {"exits": 1} if not dry else {}
            msg.fail(err, err_help, **err_kwargs)
    with working_dir(project_dir) as current_dir:
@@ -57,7 +57,10 @@ class Warnings:
            "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
            "the Knowledge Base.")
-    W026 = ("Unable to set all sentence boundaries from dependency parses.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses. If "
+            "you are constructing a parse tree incrementally by setting "
+            "token.head values, you can probably ignore this warning. Consider "
+            "using Doc(words, ..., heads=heads, deps=deps) instead.")
    W027 = ("Found a large training file of {size} bytes. Note that it may "
            "be more efficient to split your training data into multiple "
            "smaller JSON files instead.")
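The rewritten W026 message points users at the constructor path. A minimal sketch of what it suggests (blank pipeline used purely for illustration):

```python
import spacy
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
words = ["This", "works", "."]
# Passing the full parse up front lets all sentence boundaries be set at once,
# instead of assigning token.head values one by one on an existing Doc.
doc = Doc(vocab, words=words, heads=[1, 1, 1], deps=["nsubj", "ROOT", "punct"])
assert doc[0].is_sent_start
```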
@@ -452,7 +455,7 @@ class Errors:
            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
    E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
    E187 = ("Only unicode strings are supported as labels.")
-    E189 = ("Each argument to `get_doc` should be of equal length.")
+    E189 = ("Each argument to Doc.__init__ should be of equal length.")
    E190 = ("Token head out of range in `Doc.from_array()` for token index "
            "'{index}' with value '{value}' (equivalent to relative head "
            "index: '{rel_head_index}'). The head indices should be relative "

@@ -542,7 +545,8 @@ class Errors:
    E949 = ("Can only create an alignment when the texts are the same.")
    E952 = ("The section '{name}' is not a valid section in the provided config.")
    E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
-    E954 = ("The Tok2Vec listener did not receive a valid input.")
+    E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
+            "component.")
    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
    E956 = ("Can't find component '{name}' in [components] block in the config. "
            "Available components: {opts}")
@@ -17,7 +17,6 @@ Tests for spaCy modules and classes live in their own directories of the same name
5. [Helpers and utilities](#helpers-and-utilities)
6. [Contributing to the tests](#contributing-to-the-tests)

## Running the tests

To show print statements, run the tests with `py.test -s`. To abort after the
@@ -39,19 +38,17 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #

## Dos and don'ts

-To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions:
-
-* **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
-* If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
-* Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test.
-* Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version.
-* If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
-* Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
-* **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
-* If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
-* Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
-* Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.
+To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
+
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
+- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
+- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
+- If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
+- Before requiring the models, always make sure there is no other way to test the particular behavior. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
+- **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
+- If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
+- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behavior at a time.
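As an illustration of the regression-test convention in the list above (the issue number and test body are hypothetical):

```python
# spacy/tests/regression/test_issue1234.py -- hypothetical issue number
def test_issue1234(en_tokenizer):
    # one behavior per test: the tokenizer keeps the email address intact
    tokens = en_tokenizer("Contact us at hello@example.com")
    assert tokens[-1].text == "hello@example.com"
```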
## Parameters
@@ -64,7 +61,7 @@ def test_tokenizer_keep_urls(tokenizer, text):
    assert len(tokens) == 1
```

This will run the test once for each `text` value. Even if you're only testing one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test.

You can also specify parameters as tuples to test with multiple values per test:
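For example, a sketch of the tuple form (the test body and expected lengths are illustrative):

```python
import pytest


@pytest.mark.parametrize("text,length", [("Hello, world", 3), ("This is a sentence", 4)])
def test_en_tokenizer_length(en_tokenizer, text, length):
    # runs once per (text, length) tuple
    tokens = en_tokenizer(text)
    assert len(tokens) == length
```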
@@ -79,8 +76,7 @@ To test for combinations of parameters, you can add several `parametrize` markers:
@pytest.mark.parametrize('punct', ['.', '!', '?'])
```

-This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unneccessary or undesired test bloat.
-
+This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unnecessary or undesired test bloat.

## Fixtures
@@ -88,11 +84,11 @@ Fixtures to create instances of spaCy objects and other components should only b

These are the main fixtures that are currently available:

-| Fixture | Description |
-| --- | --- |
-| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. |
-| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
-| `en_vocab` | Creates an instance of the English `Vocab`. |
+| Fixture                             | Description                                                                   |
+| ----------------------------------- | ----------------------------------------------------------------------------- |
+| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class.  |
+| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                    |
+| `en_vocab`                          | Creates an instance of the English `Vocab`.                                   |

The fixtures can be used in all tests by simply setting them as an argument, like this:
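For example (a minimal sketch of the mechanism; pytest injects the fixture by its argument name):

```python
def test_tokenizer_splits_simple_sentence(en_tokenizer):
    tokens = en_tokenizer("Give it back!")
    assert [t.text for t in tokens] == ["Give", "it", "back", "!"]
```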
@@ -107,59 +103,32 @@ If all tests in a file require a specific configuration, or use the same complex

Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py).

-### Constructing a `Doc` object manually with `get_doc()`
+### Constructing a `Doc` object manually

-Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can use `get_doc()` to construct it manually.
+Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually.

```python
-def test_doc_token_api_strings(en_tokenizer):
+def test_doc_token_api_strings(en_vocab):
    text = "Give it back! He pleaded."
    pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
-    heads = [0, -1, -2, -3, 1, 0, -1]
+    heads = [0, 0, 0, 0, 5, 5, 5]
    deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']

    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps)
+    doc = Doc(en_vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps)
    assert doc[0].text == 'Give'
    assert doc[0].lower_ == 'give'
    assert doc[0].pos_ == 'VERB'
    assert doc[0].dep_ == 'ROOT'
```

-You can construct a `Doc` with the following arguments:
-
-| Argument | Description |
-| --- | --- |
-| `vocab` | `Vocab` instance to use. If you're tokenizing before creating a `Doc`, make sure to use the tokenizer's vocab. Otherwise, you can also use the `en_vocab` fixture. **(required)** |
-| `words` | List of words, for example `[t.text for t in tokens]`. **(required)** |
-| `heads` | List of heads as integers. |
-| `pos` | List of POS tags as text values. |
-| `tag` | List of tag names as text values. |
-| `dep` | List of dependencies as text values. |
-| `ents` | List of entity tuples with `start`, `end`, `label` (for example `(0, 2, 'PERSON')`). The `label` will be looked up in `vocab.strings[label]`. |
-
-Here's how to quickly get these values from within spaCy:
-
-```python
-doc = nlp(u'Some text here')
-print([token.head.i-token.i for token in doc])
-print([token.tag_ for token in doc])
-print([token.pos_ for token in doc])
-print([token.dep_ for token in doc])
-print([(ent.start, ent.end, ent.label_) for ent in doc.ents])
-```
-
-**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
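Since the removed snippet above printed relative offsets (`token.head.i - token.i`), the equivalent under the new absolute-index convention would be roughly:

```python
# Assuming a loaded pipeline `nlp`; heads are now absolute token indices.
doc = nlp("Some text here")
print([token.head.i for token in doc])                         # heads
print([token.dep_ for token in doc])                           # deps
print([(ent.start, ent.end, ent.label_) for ent in doc.ents])  # ents
```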
### Other utilities

-| Name | Description |
-| --- | --- |
-| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
-| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
-| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |
-| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. |
+| Name                                               | Description                                                                                                   |
+| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state.                          |
+| `add_vecs_to_vocab(vocab, vectors)`                | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
+| `get_cosine(vec1, vec2)`                           | Get cosine for two given vectors.                                                                             |
+| `assert_docs_equal(doc1, doc2)`                    | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. |

## Contributing to the tests
@@ -59,6 +59,11 @@ def de_tokenizer():
    return get_lang_class("de")().tokenizer


+@pytest.fixture(scope="session")
+def de_vocab():
+    return get_lang_class("de")().vocab
+
+
@pytest.fixture(scope="session")
def el_tokenizer():
    return get_lang_class("el")().tokenizer
@@ -1,12 +1,10 @@
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
from spacy.training import Example
from spacy.pipeline import EntityRecognizer
from spacy.tokens import Span, Doc
from spacy import registry
import pytest

-from ..util import get_doc
+from spacy.pipeline.ner import DEFAULT_NER_MODEL


def _ner_example(ner):
    doc = Doc(

@@ -19,7 +17,7 @@ def _ner_example(ner):

def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
-    doc = get_doc(en_vocab, text)
+    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,

@@ -41,7 +39,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    text = ["This", "is", "a", "lion"]
-    doc = get_doc(en_vocab, text)
+    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,

@@ -59,7 +57,7 @@ def test_ents_reset(en_vocab):

def test_add_overlapping_entities(en_vocab):
    text = ["Louisiana", "Office", "of", "Conservation"]
-    doc = get_doc(en_vocab, text)
+    doc = Doc(en_vocab, words=text)
    entity = Span(doc, 0, 4, label=391)
    doc.ents = [entity]
@@ -2,8 +2,6 @@ import pytest
from spacy.tokens import Doc
from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH

-from ..util import get_doc
-

def test_doc_array_attr_of_token(en_vocab):
    doc = Doc(en_vocab, words=["An", "example", "sentence"])

@@ -35,7 +33,7 @@ def test_doc_scalar_attr_of_token(en_vocab):
def test_doc_array_tag(en_vocab):
    words = ["A", "nice", "sentence", "."]
    pos = ["DET", "ADJ", "NOUN", "PUNCT"]
-    doc = get_doc(en_vocab, words=words, pos=pos)
+    doc = Doc(en_vocab, words=words, pos=pos)
    assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos
    feats_array = doc.to_array((ORTH, POS))
    assert feats_array[0][1] == doc[0].pos

@@ -47,7 +45,7 @@ def test_doc_array_tag(en_vocab):
def test_doc_array_morph(en_vocab):
    words = ["Eat", "blue", "ham"]
    morph = ["Feat=V", "Feat=J", "Feat=N"]
-    doc = get_doc(en_vocab, words=words, morphs=morph)
+    doc = Doc(en_vocab, words=words, morphs=morph)
    assert morph[0] == doc[0].morph_
    assert morph[1] == doc[1].morph_
    assert morph[2] == doc[2].morph_

@@ -61,7 +59,7 @@ def test_doc_array_morph(en_vocab):
def test_doc_array_dep(en_vocab):
    words = ["A", "nice", "sentence", "."]
    deps = ["det", "amod", "ROOT", "punct"]
-    doc = get_doc(en_vocab, words=words, deps=deps)
+    doc = Doc(en_vocab, words=words, deps=deps)
    feats_array = doc.to_array((ORTH, DEP))
    assert feats_array[0][1] == doc[0].dep
    assert feats_array[1][1] == doc[1].dep
@@ -6,7 +6,22 @@ from spacy.lexeme import Lexeme
from spacy.lang.en import English
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH

-from ..util import get_doc
+
+def test_doc_api_init(en_vocab):
+    words = ["a", "b", "c", "d"]
+    heads = [0, 0, 2, 2]
+    # set sent_start by sent_starts
+    doc = Doc(en_vocab, words=words, sent_starts=[True, False, True, False])
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # set sent_start by heads
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * 4)
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+    # heads override sent_starts
+    doc = Doc(
+        en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]


@pytest.mark.parametrize("text", [["one", "two", "three"]])
@@ -158,7 +173,7 @@ def test_doc_api_runtime_error(en_tokenizer):
            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
    # fmt: on
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
    nps = []
    for np in doc.noun_chunks:
        while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):

@@ -175,17 +190,19 @@ def test_doc_api_runtime_error(en_tokenizer):
        retokenizer.merge(np, attrs=attrs)


-def test_doc_api_right_edge(en_tokenizer):
+def test_doc_api_right_edge(en_vocab):
    """Test for bug occurring from Unshift action, causing incorrect right edge"""
    # fmt: off
-    text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
-    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
-             -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
+    words = [
+        "I", "have", "proposed", "to", "myself", ",", "for", "the", "sake",
+        "of", "such", "as", "live", "under", "the", "government", "of", "the",
+        "Romans", ",", "to", "translate", "those", "books", "into", "the",
+        "Greek", "tongue", "."
+    ]
+    heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2]
+    deps = ["dep"] * len(heads)
    # fmt: on
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert doc[6].text == "for"
    subtree = [w.text for w in doc[6].subtree]
    # fmt: off
@@ -213,16 +230,16 @@ def test_doc_api_similarity_match():


@pytest.mark.parametrize(
-    "sentence,heads,lca_matrix",
+    "words,heads,lca_matrix",
    [
        (
-            "the lazy dog slept",
-            [2, 1, 1, 0],
+            ["the", "lazy", "dog", "slept"],
+            [2, 2, 3, 3],
            numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]),
        ),
        (
-            "The lazy dog slept. The quick fox jumped",
-            [2, 1, 1, 0, -1, 2, 1, 1, 0],
+            ["The", "lazy", "dog", "slept", ".", "The", "quick", "fox", "jumped"],
+            [2, 2, 3, 3, 3, 7, 7, 8, 8],
            numpy.array(
                [
                    [0, 2, 2, 3, 3, -1, -1, -1, -1],

@@ -239,11 +256,8 @@ def test_doc_api_similarity_match():
        ),
    ],
)
-def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
-    tokens = en_tokenizer(sentence)
-    doc = get_doc(
-        tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)
-    )
+def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix):
+    doc = Doc(en_vocab, words, heads=heads, deps=["dep"] * len(heads))
    lca = doc.get_lca_matrix()
    assert (lca == lca_matrix).all()
    assert lca[1, 1] == 1
@@ -267,26 +281,23 @@ def test_doc_is_nered(en_vocab):


def test_doc_from_array_sent_starts(en_vocab):
-    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
-    heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3]
+    # fmt: off
+    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
+    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
+    # fmt: on
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
-
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    # HEAD overrides SENT_START without warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)

    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with pytest.warns(None) as record:
        new_doc.from_array(attrs, arr)
    assert len(record) == 0

    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
@@ -294,7 +305,6 @@ def test_doc_from_array_sent_starts(en_vocab):
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.has_annotation("DEP")
-
    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)

@@ -305,19 +315,17 @@ def test_doc_from_array_sent_starts(en_vocab):


def test_doc_from_array_morph(en_vocab):
-    words = ["I", "live", "in", "New", "York", "."]
+    # fmt: off
+    words = ["I", "live", "in", "New", "York", "."]
    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
+    # fmt: on
    doc = Doc(en_vocab, words=words)
    for i, morph in enumerate(morphs):
        doc[i].morph_ = morph
-
    attrs = [MORPH]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
-
    assert [t.morph_ for t in new_doc] == morphs
    assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
@@ -329,15 +337,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_docs = [en_tokenizer(text) for text in en_texts]
    docs_idx = en_texts[0].index("docs")
    de_doc = de_tokenizer(de_text)
-    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (
-        True,
-        None,
-        None,
-        None,
-    )
-
+    expected = (True, None, None, None)
+    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected
    assert Doc.from_docs([]) is None
-
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -455,7 +457,7 @@ def test_is_flags_deprecated(en_tokenizer):
    doc.is_sentenced


-def test_set_ents(en_tokenizer):
+def test_doc_set_ents(en_tokenizer):
    # set ents
    doc = en_tokenizer("a b c d e")
    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])

@@ -520,3 +522,16 @@ def test_set_ents(en_tokenizer):
    # conflicting/overlapping specifications
    with pytest.raises(ValueError):
        doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
+
+
+def test_doc_ents_setter():
+    """Test that both strings and integers can be used to set entities in
+    tuple format via doc.ents."""
+    words = ["a", "b", "c", "d", "e"]
+    doc = Doc(Vocab(), words=words)
+    doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
+    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+    vocab = Vocab()
+    ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    doc = Doc(vocab, words=words, ents=ents)
+    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
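For reference, the two tuple formats exercised by the new test, as a standalone sketch:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["a", "b", "c", "d", "e"])
# The label can be given as a string ...
doc.ents = [("HELLO", 0, 2)]
# ... or as a hash from the StringStore; both resolve to the same label.
doc.ents = [(vocab.strings.add("WORLD"), 3, 5)]
assert [e.label_ for e in doc.ents] == ["WORLD"]
```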
@@ -3,8 +3,6 @@ from spacy.attrs import LEMMA
from spacy.vocab import Vocab
from spacy.tokens import Doc, Token

-from ..util import get_doc
-

def test_doc_retokenize_merge(en_tokenizer):
    text = "WKRO played songs by the beach boys all night"

@@ -88,9 +86,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer):

def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
    text = "Los Angeles start."
-    heads = [1, 1, 0, -1]
+    heads = [1, 2, 2, 2]
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
    assert len(doc) == 4
    assert doc[0].head.text == "Angeles"
    assert doc[1].head.text == "start"
@@ -103,17 +101,12 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
    assert doc[0].ent_type_ == "GPE"


-def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
-    text = "The players start."
-    heads = [1, 1, 0, -1]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        tags=["DT", "NN", "VBZ", "."],
-        pos=["DET", "NOUN", "VERB", "PUNCT"],
-        heads=heads,
-    )
+def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
+    words = ["The", "players", "start", "."]
+    heads = [1, 2, 2, 2]
+    tags = ["DT", "NN", "VBZ", "."]
+    pos = ["DET", "NOUN", "VERB", "PUNCT"]
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
    assert len(doc) == 4
    assert doc[0].text == "The"
    assert doc[0].tag_ == "DT"

@@ -124,13 +117,7 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[0].text == "The players"
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        tags=["DT", "NN", "VBZ", "."],
-        pos=["DET", "NOUN", "VERB", "PUNCT"],
-        heads=heads,
-    )
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
    assert len(doc) == 4
    assert doc[0].text == "The"
    assert doc[0].tag_ == "DT"
@@ -147,11 +134,10 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    assert doc[1].pos_ == "VERB"


-def test_doc_retokenize_spans_merge_heads(en_tokenizer):
-    text = "I found a pilates class near work."
-    heads = [1, 0, 2, 1, -3, -1, -1, -6]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+def test_doc_retokenize_spans_merge_heads(en_vocab):
+    words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
+    heads = [1, 1, 4, 6, 1, 4, 5, 1]
+    doc = Doc(en_vocab, words=words, heads=heads)
    assert len(doc) == 8
    with doc.retokenize() as retokenizer:
        attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}
@@ -182,9 +168,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):

def test_doc_retokenize_span_np_merges(en_tokenizer):
    text = "displaCy is a parse tool built with Javascript"
-    heads = [1, 0, 2, 1, -3, -1, -1, -1]
+    heads = [1, 1, 4, 4, 1, 4, 5, 6]
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
    assert doc[4].head.i == 1
    with doc.retokenize() as retokenizer:
        attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}

@@ -192,18 +178,18 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
    assert doc[2].head.i == 1

    text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
-    heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
+    heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
            retokenizer.merge(ent, attrs=attrs)

    text = "One test with entities like New York City so the ents list is not void"
-    heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
+    heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
@@ -212,12 +198,12 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
-    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
+    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
    tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
-    ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")]
+    ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
    # fmt: on
    tokens = en_tokenizer(text)
-    doc = get_doc(
+    doc = Doc(
        tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
    )
    assert len(doc) == 17
@@ -282,13 +268,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):

    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
-    ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
+    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
+    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
    en_vocab.strings.add("dep")
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    assert doc[2:4].root == doc[3]  # root of 'c d' is d
    assert doc[4:6].root == doc[4]  # root of 'e f' is e
    with doc.retokenize() as retokenizer:

@@ -305,10 +291,10 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):

    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
-    ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
+    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
+    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
    deps = ["dep"] * len(words)
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])
        retokenizer.merge(doc[5:7])
@@ -322,13 +308,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
-    heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
+    heads = [1, 2, 2, 4, 2, 4, 4, 2, 9, 9, 9, 10, 9, 9, 15, 13, 9]
    deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'dobj', 'punct']
    # fmt: on
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)
    init_len = len(sent1)
    init_len2 = len(sent2)

@@ -343,13 +329,13 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
-    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
+    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12]
    deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr",
            "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
            "dobj"]
    # fmt: on
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1 = list(doc.sents)[0]
    init_len = len(list(sent1.root.subtree))
    with doc.retokenize() as retokenizer:
@@ -2,13 +2,11 @@ import pytest
from spacy.vocab import Vocab
from spacy.tokens import Doc, Token

-from ..util import get_doc
-

def test_doc_retokenize_split(en_vocab):
    words = ["LosAngeles", "start", "."]
-    heads = [1, 1, 0]
-    doc = get_doc(en_vocab, words=words, heads=heads)
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
    assert len(doc) == 3
    assert len(str(doc)) == 19
    assert doc[0].head.text == "start"

@@ -88,11 +86,11 @@ def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
    # fmt: off
    words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
             "lives", "in", "England", "and", "loves", "JoePasquale", "."]
-    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
+    heads = [1, 1, 3, 5, 3, 1, 1, 8, 8, 8, 9, 8, 8, 14, 12]
    deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
            "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
    # fmt: on
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)
    init_len = len(sent1)
    init_len2 = len(sent2)
@@ -4,19 +4,17 @@ from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.util import filter_spans

-from ..util import get_doc
-

@pytest.fixture
def doc(en_tokenizer):
    # fmt: off
    text = "This is a sentence. This is another sentence. And a third."
-    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
+    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
    # fmt: on
    tokens = en_tokenizer(text)
-    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


@pytest.fixture

@@ -69,10 +67,10 @@ def test_spans_string_fn(doc):

def test_spans_root2(en_tokenizer):
    text = "through North and South Carolina"
-    heads = [0, 3, -1, -2, -4]
+    heads = [0, 4, 1, 1, 0]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert doc[-2:].root.text == "Carolina"


@@ -92,10 +90,10 @@ def test_spans_span_sent(doc, doc_not_parsed):
def test_spans_lca_matrix(en_tokenizer):
    """Test span's lca matrix generation"""
    tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(
+    doc = Doc(
        tokens.vocab,
        words=[t.text for t in tokens],
-        heads=[2, 1, 1, 0],
+        heads=[2, 2, 3, 3],
        deps=["dep"] * 4,
    )
    lca = doc[:2].get_lca_matrix()
@@ -1,6 +1,5 @@
import pytest
from spacy.tokens import Doc
-from ..util import get_doc


@pytest.fixture()

@@ -8,10 +7,10 @@ def doc(en_vocab):
    words = ["c", "d", "e"]
    pos = ["VERB", "NOUN", "NOUN"]
    tags = ["VBP", "NN", "NN"]
-    heads = [0, -1, -2]
+    heads = [0, 0, 0]
    deps = ["ROOT", "dobj", "dobj"]
-    ents = [(1, 2, "ORG")]
-    return get_doc(
+    ents = [("ORG", 1, 2)]
+    return Doc(
        en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
    )
@@ -5,31 +5,24 @@ from spacy.symbols import VERB
from spacy.vocab import Vocab
from spacy.tokens import Doc

-from ..util import get_doc
-

@pytest.fixture
-def doc(en_tokenizer):
+def doc(en_vocab):
    # fmt: off
-    text = "This is a sentence. This is another sentence. And a third."
-    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
+    words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."]
+    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 10, 12, 10, 12]
    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
    # fmt: on
-    tokens = en_tokenizer(text)
-    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    return Doc(en_vocab, words=words, heads=heads, deps=deps)


-def test_doc_token_api_strings(en_tokenizer):
-    text = "Give it back! He pleaded."
+def test_doc_token_api_strings(en_vocab):
+    words = ["Give", "it", "back", "!", "He", "pleaded", "."]
    pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"]
-    heads = [0, -1, -2, -3, 1, 0, -1]
+    heads = [0, 0, 0, 0, 5, 5, 5]
    deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"]
-
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
-    )
+    doc = Doc(en_vocab, words=words, pos=pos, heads=heads, deps=deps)
    assert doc[0].orth_ == "Give"
    assert doc[0].text == "Give"
    assert doc[0].text_with_ws == "Give "
@@ -97,88 +90,70 @@ def test_doc_token_api_vectors():
    assert doc[0].similarity(doc[1]) == cosine


-def test_doc_token_api_ancestors(en_tokenizer):
+def test_doc_token_api_ancestors(en_vocab):
    # the structure of this sentence depends on the English annotation scheme
-    text = "Yesterday I saw a dog that barked loudly."
-    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
+    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
    assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
    assert [t.text for t in doc[1].ancestors] == ["saw"]
    assert [t.text for t in doc[2].ancestors] == []

    assert doc[2].is_ancestor(doc[7])
    assert not doc[6].is_ancestor(doc[2])


-def test_doc_token_api_head_setter(en_tokenizer):
-    text = "Yesterday I saw a dog that barked loudly."
-    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
+def test_doc_token_api_head_setter(en_vocab):
+    words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
+    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
    deps = ["dep"] * len(heads)
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
-
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert doc[6].n_lefts == 1
    assert doc[6].n_rights == 1
    assert doc[6].left_edge.i == 5
    assert doc[6].right_edge.i == 7

    assert doc[4].n_lefts == 1
    assert doc[4].n_rights == 1
    assert doc[4].left_edge.i == 3
    assert doc[4].right_edge.i == 7

    assert doc[3].n_lefts == 0
    assert doc[3].n_rights == 0
    assert doc[3].left_edge.i == 3
    assert doc[3].right_edge.i == 3

    assert doc[2].left_edge.i == 0
    assert doc[2].right_edge.i == 8

    doc[6].head = doc[3]

    assert doc[6].n_lefts == 1
    assert doc[6].n_rights == 1
    assert doc[6].left_edge.i == 5
    assert doc[6].right_edge.i == 7

    assert doc[3].n_lefts == 0
    assert doc[3].n_rights == 1
    assert doc[3].left_edge.i == 3
    assert doc[3].right_edge.i == 7

    assert doc[4].n_lefts == 1
    assert doc[4].n_rights == 0
    assert doc[4].left_edge.i == 3
    assert doc[4].right_edge.i == 7

    assert doc[2].left_edge.i == 0
    assert doc[2].right_edge.i == 8

    doc[0].head = doc[5]

    assert doc[5].left_edge.i == 0
    assert doc[6].left_edge.i == 0
    assert doc[3].left_edge.i == 0
    assert doc[4].left_edge.i == 0
    assert doc[2].left_edge.i == 0

    # head token must be from the same document
-    doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc2 = Doc(en_vocab, words=words, heads=heads)
    with pytest.raises(ValueError):
        doc[0].head = doc2[0]

    # test sentence starts when two sentences are joined
-    text = "This is one sentence. This is another sentence."
-    heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        heads=heads,
-        deps=["dep"] * len(heads),
-    )
+    # fmt: off
+    words = ["This", "is", "one", "sentence", ".", "This", "is", "another", "sentence", "."]
+    heads = [0, 0, 0, 0, 0, 5, 5, 5, 5, 5]
+    # fmt: on
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
    # initially two sentences
    assert doc[0].is_sent_start
    assert doc[5].is_sent_start

@@ -186,7 +161,6 @@ def test_doc_token_api_head_setter(en_tokenizer):
    assert doc[0].right_edge == doc[4]
    assert doc[5].left_edge == doc[5]
    assert doc[5].right_edge == doc[9]
-
    # modifying with a sentence doesn't change sent starts
    doc[2].head = doc[3]
    assert doc[0].is_sent_start

@@ -195,7 +169,6 @@ def test_doc_token_api_head_setter(en_tokenizer):
    assert doc[0].right_edge == doc[4]
    assert doc[5].left_edge == doc[5]
    assert doc[5].right_edge == doc[9]
-
    # attach the second sentence to the first, resulting in one sentence
    doc[5].head = doc[0]
    assert doc[0].is_sent_start
@ -252,28 +225,28 @@ def test_tokenlast_has_sent_end_true():
|
|||
|
||||
|
||||
def test_token_api_conjuncts_chain(en_vocab):
|
||||
words = "The boy and the girl and the man went .".split()
|
||||
heads = [1, 7, -1, 1, -3, -1, 1, -3, 0, -1]
|
||||
words = ["The", "boy", "and", "the", "girl", "and", "the", "man", "went", "."]
|
||||
heads = [1, 8, 1, 4, 1, 4, 7, 4, 8, 8]
|
||||
deps = ["det", "nsubj", "cc", "det", "conj", "cc", "det", "conj", "ROOT", "punct"]
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
assert [w.text for w in doc[1].conjuncts] == ["girl", "man"]
|
||||
assert [w.text for w in doc[4].conjuncts] == ["boy", "man"]
|
||||
assert [w.text for w in doc[7].conjuncts] == ["boy", "girl"]
|
||||
|
||||
|
||||
def test_token_api_conjuncts_simple(en_vocab):
|
||||
words = "They came and went .".split()
|
||||
heads = [1, 0, -1, -2, -1]
|
||||
words = ["They", "came", "and", "went", "."]
|
||||
heads = [1, 1, 1, 1, 3]
|
||||
deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
assert [w.text for w in doc[1].conjuncts] == ["went"]
|
||||
assert [w.text for w in doc[3].conjuncts] == ["came"]
|
||||
|
||||
|
||||
def test_token_api_non_conjuncts(en_vocab):
|
||||
words = "They came .".split()
|
||||
heads = [1, 0, -1]
|
||||
words = ["They", "came", "."]
|
||||
heads = [1, 1, 1]
|
||||
deps = ["nsubj", "ROOT", "punct"]
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
assert [w.text for w in doc[0].conjuncts] == []
|
||||
assert [w.text for w in doc[1].conjuncts] == []
|
||||
|
|
|
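The conversion repeated throughout this diff follows one rule: `get_doc()` took each head as an offset relative to its own token, while the `Doc` constructor takes absolute token indices. A minimal sketch of that rule (`relative_to_absolute` is an illustrative helper, not a spaCy API):

```python
# Convert get_doc()-style relative head offsets into the absolute token
# indices expected by the Doc constructor.
def relative_to_absolute(rel_heads):
    return [i + offset for i, offset in enumerate(rel_heads)]

# The "Yesterday I saw a dog that barked loudly ." tree from the tests above:
assert relative_to_absolute([2, 1, 0, 1, -2, 1, -2, -1, -6]) == [2, 2, 2, 4, 2, 6, 4, 6, 2]
```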
@ -1,30 +1,26 @@
from ...util import get_doc
from spacy.tokens import Doc


def test_de_parser_noun_chunks_standard_de(de_tokenizer):
text = "Eine Tasse steht auf dem Tisch."
heads = [1, 1, 0, -1, 1, -2, -4]
def test_de_parser_noun_chunks_standard_de(de_vocab):
words = ["Eine", "Tasse", "steht", "auf", "dem", "Tisch", "."]
heads = [1, 2, 2, 2, 5, 3, 2]
pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"]
tokens = de_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
)
doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "Eine Tasse "
assert chunks[1].text_with_ws == "dem Tisch "


def test_de_extended_chunk(de_tokenizer):
text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
def test_de_extended_chunk(de_vocab):
# fmt: off
words = ["Die", "Sängerin", "singt", "mit", "einer", "Tasse", "Kaffee", "Arien", "."]
heads = [1, 2, 2, 2, 5, 3, 5, 2, 2]
pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "NOUN", "NOUN", "PUNCT"]
deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"]
tokens = de_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
)
# fmt: on
doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "Die Sängerin "
@ -2,13 +2,10 @@ import numpy
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
from spacy.lang.en.syntax_iterators import noun_chunks

from spacy.tokens import Doc
import pytest


from ...util import get_doc


def test_noun_chunks_is_parsed(en_tokenizer):
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
"""

@ -19,9 +16,9 @@ def test_noun_chunks_is_parsed(en_tokenizer):

def test_en_noun_chunks_not_nested(en_vocab):
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
heads = [1, 0, 4, 3, -1, -2, -5]
heads = [1, 1, 6, 6, 3, 3, 1]
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
doc.from_array(
[HEAD, DEP],
numpy.asarray(
@ -1,63 +1,51 @@
from ...util import get_doc
from spacy.tokens import Doc


def test_en_parser_noun_chunks_standard(en_tokenizer):
text = "A base phrase should be recognized."
heads = [2, 1, 3, 2, 1, 0, -1]
def test_en_parser_noun_chunks_standard(en_vocab):
words = ["A", "base", "phrase", "should", "be", "recognized", "."]
heads = [2, 2, 5, 5, 5, 5, 5]
pos = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"]
deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
)
doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 1
assert chunks[0].text_with_ws == "A base phrase "


def test_en_parser_noun_chunks_coordinated(en_tokenizer):
def test_en_parser_noun_chunks_coordinated(en_vocab):
# fmt: off
text = "A base phrase and a good phrase are often the same."
heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
words = ["A", "base", "phrase", "and", "a", "good", "phrase", "are", "often", "the", "same", "."]
heads = [2, 2, 7, 2, 6, 6, 2, 7, 7, 10, 7, 7]
pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"]
deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
)
doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "A base phrase "
assert chunks[1].text_with_ws == "a good phrase "


def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
text = "A phrase with another phrase occurs."
heads = [1, 4, -1, 1, -2, 0, -1]
def test_en_parser_noun_chunks_pp_chunks(en_vocab):
words = ["A", "phrase", "with", "another", "phrase", "occurs", "."]
heads = [1, 5, 1, 4, 2, 5, 5]
pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"]
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
)
doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "A phrase "
assert chunks[1].text_with_ws == "another phrase "


def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
def test_en_parser_noun_chunks_appositional_modifiers(en_vocab):
# fmt: off
text = "Sam, my brother, arrived to the house."
heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4]
words = ["Sam", ",", "my", "brother", ",", "arrived", "to", "the", "house", "."]
heads = [5, 0, 3, 0, 0, 5, 5, 8, 6, 5]
pos = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
)
doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "Sam "

@ -65,15 +53,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
assert chunks[2].text_with_ws == "the house "


def test_en_parser_noun_chunks_dative(en_tokenizer):
text = "She gave Bob a raise."
heads = [1, 0, -1, 1, -3, -4]
def test_en_parser_noun_chunks_dative(en_vocab):
words = ["She", "gave", "Bob", "a", "raise", "."]
heads = [1, 1, 1, 4, 1, 1]
pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"]
deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
)
doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "She "
@ -1,15 +1,16 @@
import pytest
from spacy.tokens import Doc

from ...util import get_doc, apply_transition_sequence
from ...util import apply_transition_sequence


@pytest.mark.parametrize("text", ["A test sentence"])
@pytest.mark.parametrize("words", [["A", "test", "sentence"]])
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
def test_en_sbd_single_punct(en_tokenizer, text, punct):
heads = [2, 1, 0, -1] if punct else [2, 1, 0]
def test_en_sbd_single_punct(en_vocab, words, punct):
heads = [2, 2, 2, 2] if punct else [2, 2, 2]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text + punct)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
words = [*words, punct] if punct else words
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert len(doc) == 4 if punct else 3
assert len(list(doc.sents)) == 1
assert sum(len(sent) for sent in doc.sents) == len(doc)

@ -18,17 +19,16 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
def test_en_sentence_breaks(en_tokenizer, en_parser):
def test_en_sentence_breaks(en_vocab, en_parser):
# fmt: off
text = "This is a sentence . This is another one ."
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "one", "."]
heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
"attr", "punct"]
transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT",
"L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
apply_transition_sequence(en_parser, doc, transition)
assert len(list(doc.sents)) == 2
for token in doc:
@ -1,6 +1,5 @@
import pytest

from ...util import get_doc
from spacy.tokens import Doc


def test_ru_doc_lemmatization(ru_lemmatizer):

@ -11,7 +10,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer):
"Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
"Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
]
doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
doc = Doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
doc = ru_lemmatizer(doc)
lemmas = [token.lemma_ for token in doc]
assert lemmas == ["мама", "мыть", "рама"]

@ -28,7 +27,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer):
],
)
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
assert sorted(result_lemmas) == lemmas

@ -51,7 +50,7 @@ def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
def test_ru_lemmatizer_works_with_different_pos_homonyms(
ru_lemmatizer, text, pos, morph, lemma
):
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
doc = Doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
assert result_lemmas == [lemma]

@ -66,13 +65,13 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms(
],
)
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
assert result_lemmas == [lemma]


def test_ru_lemmatizer_punct(ru_lemmatizer):
doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
doc = Doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
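For orientation, the construction pattern these tests converge on looks like this in isolation — a minimal sketch using only constructor arguments that appear in the diff:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

# Build an annotated Doc without a tokenizer or a trained pipeline.
words = ["They", "came", "."]
heads = [1, 1, 1]  # absolute indices; the root token points at itself
deps = ["nsubj", "ROOT", "punct"]
doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
assert doc[1].head.i == 1  # "came" is its own head, i.e. the sentence root
assert [t.dep_ for t in doc] == deps
```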
@ -1,6 +1,5 @@
import pytest

from ...util import get_doc
from spacy.tokens import Doc


def test_noun_chunks_is_parsed_sv(sv_tokenizer):

@ -16,21 +15,21 @@ SV_NP_TEST_EXAMPLES = [
"En student läste en bok", # A student read a book
["DET", "NOUN", "VERB", "DET", "NOUN"],
["det", "nsubj", "ROOT", "det", "dobj"],
[1, 1, 0, 1, -2],
[1, 2, 2, 4, 2],
["En student", "en bok"],
),
(
"Studenten läste den bästa boken.", # The student read the best book
["NOUN", "VERB", "DET", "ADJ", "NOUN", "PUNCT"],
["nsubj", "ROOT", "det", "amod", "dobj", "punct"],
[1, 0, 2, 1, -3, -4],
[1, 1, 4, 4, 1, 1],
["Studenten", "den bästa boken"],
),
(
"De samvetslösa skurkarna hade stulit de största juvelerna på söndagen", # The remorseless crooks had stolen the largest jewels that sunday
["DET", "ADJ", "NOUN", "VERB", "VERB", "DET", "ADJ", "NOUN", "ADP", "NOUN"],
["det", "amod", "nsubj", "aux", "root", "det", "amod", "dobj", "case", "nmod"],
[2, 1, 2, 1, 0, 2, 1, -3, 1, -5],
[2, 2, 4, 4, 4, 7, 7, 4, 9, 4],
["De samvetslösa skurkarna", "de största juvelerna", "på söndagen"],
),
]

@ -41,12 +40,9 @@ SV_NP_TEST_EXAMPLES = [
)
def test_sv_noun_chunks(sv_tokenizer, text, pos, deps, heads, expected_noun_chunks):
tokens = sv_tokenizer(text)

assert len(heads) == len(pos)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, pos=pos
)

words = [t.text for t in tokens]
doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps, pos=pos)
noun_chunks = list(doc.noun_chunks)
assert len(noun_chunks) == len(expected_noun_chunks)
for i, np in enumerate(noun_chunks):
@ -4,16 +4,15 @@ import re
import copy
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc
from spacy.tokens import Doc


@pytest.fixture
def doc(en_vocab):
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"]
heads = [3, 3, 3, 4, 4, 4, 8, 8, 5]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
return doc
return Doc(en_vocab, words=words, heads=heads, deps=deps)


@pytest.fixture

@ -236,10 +235,10 @@ def test_dependency_matcher_callback(en_vocab, doc):
@pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
# two sentences to test that all matches are within the same sentence
doc = get_doc(
doc = Doc(
en_vocab,
words=["a", "b", "c", "d", "e"] * 2,
heads=[0, -1, -2, -3, -4] * 2,
heads=[0, 0, 0, 0, 0, 5, 5, 5, 5, 5],
deps=["dep"] * 10,
)
match_count = 0
@ -3,7 +3,6 @@ import srsly
from mock import Mock
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span
from ..util import get_doc


def test_matcher_phrase_matcher(en_vocab):

@ -140,10 +139,10 @@ def test_phrase_matcher_string_attrs(en_vocab):
pos1 = ["PRON", "VERB", "NOUN"]
words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
pattern = get_doc(en_vocab, words=words1, pos=pos1)
pattern = Doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr="POS")
matcher.add("TEST", [pattern])
doc = get_doc(en_vocab, words=words2, pos=pos2)
doc = Doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 1
match_id, start, end = matches[0]

@ -158,10 +157,10 @@ def test_phrase_matcher_string_attrs_negative(en_vocab):
pos1 = ["PRON", "VERB", "NOUN"]
words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
pos2 = ["X", "X", "X"]
pattern = get_doc(en_vocab, words=words1, pos=pos1)
pattern = Doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr="POS")
matcher.add("TEST", [pattern])
doc = get_doc(en_vocab, words=words2, pos=pos2)
doc = Doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 0
@ -2,8 +2,7 @@ import pytest
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
from spacy.pipeline._parser_internals import nonproj

from ..util import get_doc
from spacy.tokens import Doc


@pytest.fixture

@ -74,16 +73,10 @@ def test_parser_is_nonproj_tree(
assert is_nonproj_tree(multirooted_tree) is True


def test_parser_pseudoprojectivity(en_tokenizer):
def test_parser_pseudoprojectivity(en_vocab):
def deprojectivize(proj_heads, deco_labels):
tokens = en_tokenizer("whatever " * len(proj_heads))
rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
deps=deco_labels,
heads=rel_proj_heads,
)
words = ["whatever "] * len(proj_heads)
doc = Doc(en_vocab, words=words, deps=deco_labels, heads=proj_heads)
nonproj.deprojectivize(doc)
return [t.head.i for t in doc], [token.dep_ for token in doc]

@ -94,49 +87,39 @@ def test_parser_pseudoprojectivity(en_tokenizer):
labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
# fmt: on

assert nonproj.decompose("X||Y") == ("X", "Y")
assert nonproj.decompose("X") == ("X", "")
assert nonproj.is_decorated("X||Y") is True
assert nonproj.is_decorated("X") is False

nonproj._lift(0, tree)
assert tree == [2, 2, 2]

assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10

# fmt: off
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
"nsubj", "acl||dobj", "punct"]

deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
assert deproj_heads == nonproj_tree
assert undeco_labels == labels

proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
"det", "dobj", "det", "nmod", "aux", "nmod||dobj",
"advmod", "det", "amod", "punct"]

deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
assert deproj_heads == nonproj_tree2
assert undeco_labels == labels2

# if decoration is wrong such that there is no head with the desired label
# the structure is kept and the label is undecorated
proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
"acl||iobj", "punct"]

deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
assert deproj_heads == proj_heads
assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
"nsubj", "acl", "punct"]

# if there are two potential new heads, the first one is chosen even if
# it's wrong
proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
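Background for the pseudo-projectivity assertions above: `projectivize` lifts a non-projective arc and records the lost head's label behind a `||` separator, which `deprojectivize` later uses to restore the arc. A standalone check of the label helpers, mirroring the asserts in the test:

```python
from spacy.pipeline._parser_internals import nonproj

# A decorated label carries the original head's label after "||".
assert nonproj.decompose("acl||dobj") == ("acl", "dobj")
assert nonproj.is_decorated("acl||dobj") is True
assert nonproj.is_decorated("acl") is False
```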
@ -1,9 +1,11 @@
import pytest

from spacy.lang.en import English
from ..util import get_doc, apply_transition_sequence, make_tempdir
from ... import util
from ...training import Example
from spacy.training import Example
from spacy.tokens import Doc
from spacy import util

from ..util import apply_transition_sequence, make_tempdir


TRAIN_DATA = [
(

@ -23,12 +25,11 @@ TRAIN_DATA = [
]


def test_parser_root(en_tokenizer):
text = "i don't have other assistance"
heads = [3, 2, 1, 0, 1, -2]
def test_parser_root(en_vocab):
words = ["i", "do", "n't", "have", "other", "assistance"]
heads = [3, 3, 3, 3, 5, 3]
deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
for t in doc:
assert t.dep != 0, t.text

@ -36,13 +37,9 @@ def test_parser_root(en_tokenizer):
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
@pytest.mark.parametrize("text", ["Hello"])
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
)

@pytest.mark.parametrize("words", [["Hello"]])
def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
doc = Doc(en_vocab, words=words, heads=[0], deps=["ROOT"])
assert len(doc) == 1
with en_parser.step_through(doc) as _: # noqa: F841
pass

@ -52,24 +49,22 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
def test_parser_initial(en_tokenizer, en_parser):
text = "I ate the pizza with anchovies."
# heads = [1, 0, 1, -2, -3, -1, -5]
def test_parser_initial(en_vocab, en_parser):
words = ["I", "ate", "the", "pizza", "with", "anchovies", "."]
transition = ["L-nsubj", "S", "L-det"]
tokens = en_tokenizer(text)
apply_transition_sequence(en_parser, tokens, transition)
assert tokens[0].head.i == 1
assert tokens[1].head.i == 1
assert tokens[2].head.i == 3
assert tokens[3].head.i == 3
doc = Doc(en_vocab, words=words)
apply_transition_sequence(en_parser, doc, transition)
assert doc[0].head.i == 1
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[3].head.i == 3


def test_parser_parse_subtrees(en_tokenizer, en_parser):
text = "The four wheels on the bus turned quickly"
heads = [2, 1, 4, -1, 1, -2, 0, -1]
def test_parser_parse_subtrees(en_vocab, en_parser):
words = ["The", "four", "wheels", "on", "the", "bus", "turned", "quickly"]
heads = [2, 2, 6, 2, 5, 3, 6, 6]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert len(list(doc[2].lefts)) == 2
assert len(list(doc[2].rights)) == 1
assert len(list(doc[2].children)) == 3

@ -79,15 +74,12 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser):
assert len(list(doc[2].subtree)) == 6


def test_parser_merge_pp(en_tokenizer):
text = "A phrase with another phrase occurs"
heads = [1, 4, -1, 1, -2, 0]
def test_parser_merge_pp(en_vocab):
words = ["A", "phrase", "with", "another", "phrase", "occurs"]
heads = [1, 5, 1, 4, 2, 5]
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos
)
doc = Doc(en_vocab, words=words, deps=deps, heads=heads, pos=pos)
with doc.retokenize() as retokenizer:
for np in doc.noun_chunks:
retokenizer.merge(np, attrs={"lemma": np.lemma_})

@ -100,12 +92,11 @@ def test_parser_merge_pp(en_tokenizer):
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
text = "a b c d e"

def test_parser_arc_eager_finalize_state(en_vocab, en_parser):
words = ["a", "b", "c", "d", "e"]
# right branching
transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"]
tokens = en_tokenizer(text)
tokens = Doc(en_vocab, words=words)
apply_transition_sequence(en_parser, tokens, transition)

assert tokens[0].n_lefts == 0

@ -140,7 +131,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):

# left branching
transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"]
tokens = en_tokenizer(text)
tokens = Doc(en_vocab, words=words)
apply_transition_sequence(en_parser, tokens, transition)

assert tokens[0].n_lefts == 0

@ -177,10 +168,10 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
def test_parser_set_sent_starts(en_vocab):
# fmt: off
words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
heads = [1, 1, 1, 30, 4, 4, 7, 4, 7, 17, 14, 14, 11, 14, 17, 16, 17, 6, 17, 20, 11, 20, 26, 22, 26, 26, 20, 26, 29, 31, 31, 25, 31, 32, 17, 4, 4, 36]
deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
# fmt: on
doc = get_doc(en_vocab, words=words, deps=deps, heads=heads)
doc = Doc(en_vocab, words=words, deps=deps, heads=heads)
for i in range(len(words)):
if i == 0 or i == 3:
assert doc[i].is_sent_start is True

@ -201,24 +192,21 @@ def test_overfitting_IO():
for dep in annotations.get("deps", []):
parser.add_label(dep)
optimizer = nlp.begin_training()

for i in range(100):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["parser"] < 0.0001

# test the trained model
test_text = "I like securities."
doc = nlp(test_text)
assert doc[0].dep_ is "nsubj"
assert doc[2].dep_ is "dobj"
assert doc[3].dep_ is "punct"

assert doc[0].dep_ == "nsubj"
assert doc[2].dep_ == "dobj"
assert doc[3].dep_ == "punct"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
assert doc2[0].dep_ is "nsubj"
assert doc2[2].dep_ is "dobj"
assert doc2[3].dep_ is "punct"
assert doc2[0].dep_ == "nsubj"
assert doc2[2].dep_ == "dobj"
assert doc2[3].dep_ == "punct"
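The `test_overfitting_IO` hunk above also swaps `is` for `==` when comparing `dep_` against string literals: `is` tests object identity, which only coincides with equality when CPython happens to intern both strings, and recent Python versions warn about such comparisons. A standalone illustration:

```python
# `is` compares object identity, `==` compares values. Equal strings are
# not guaranteed to be the same object, so identity checks are unreliable.
a = "nsubj"
b = "".join(["ns", "ubj"])  # same value, built at runtime, distinct object
assert a == b
assert a is not b
```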
@ -1,59 +1,75 @@
import pytest

from ..util import get_doc
from spacy.tokens import Doc


@pytest.fixture
def text():
return """
It was a bright cold day in April, and the clocks were striking thirteen.
Winston Smith, his chin nuzzled into his breast in an effort to escape the
vile wind, slipped quickly through the glass doors of Victory Mansions,
though not quickly enough to prevent a swirl of gritty dust from entering
along with him.

The hallway smelt of boiled cabbage and old rag mats. At one end of it a
coloured poster, too large for indoor display, had been tacked to the wall.
It depicted simply an enormous face, more than a metre wide: the face of a
man of about forty-five, with a heavy black moustache and ruggedly handsome
features. Winston made for the stairs. It was no use trying the lift. Even at
the best of times it was seldom working, and at present the electric current
was cut off during daylight hours. It was part of the economy drive in
preparation for Hate Week. The flat was seven flights up, and Winston, who
was thirty-nine and had a varicose ulcer above his right ankle, went slowly,
resting several times on the way. On each landing, opposite the lift-shaft,
the poster with the enormous face gazed from the wall. It was one of those
pictures which are so contrived that the eyes follow you about when you move.
BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
"""
def words():
# fmt: off
return [
"\n", "It", "was", "a", "bright", "cold", "day", "in", "April", ",",
"and", "the", "clocks", "were", "striking", "thirteen", ".", "\n",
"Winston", "Smith", ",", "his", "chin", "nuzzled", "into", "his",
"breast", "in", "an", "effort", "to", "escape", "the", "\n", "vile",
"wind", ",", "slipped", "quickly", "through", "the", "glass", "doors",
"of", "Victory", "Mansions", ",", "\n", "though", "not", "quickly",
"enough", "to", "prevent", "a", "swirl", "of", "gritty", "dust",
"from", "entering", "\n", "along", "with", "him", ".", "\n\n", "The",
"hallway", "smelt", "of", "boiled", "cabbage", "and", "old", "rag",
"mats", ".", "At", "one", "end", "of", "it", "a", "\n", "coloured",
"poster", ",", "too", "large", "for", "indoor", "display", ",", "had",
"been", "tacked", "to", "the", "wall", ".", "\n", "It", "depicted",
"simply", "an", "enormous", "face", ",", "more", "than", "a", "metre",
"wide", ":", "the", "face", "of", "a", "\n", "man", "of", "about",
"forty", "-", "five", ",", "with", "a", "heavy", "black", "moustache",
"and", "ruggedly", "handsome", "\n", "features", ".", "Winston", "made",
"for", "the", "stairs", ".", "It", "was", "no", "use", "trying", "the",
"lift", ".", "Even", "at", "\n", "the", "best", "of", "times", "it",
"was", "seldom", "working", ",", "and", "at", "present", "the",
"electric", "current", "\n", "was", "cut", "off", "during", "daylight",
"hours", ".", "It", "was", "part", "of", "the", "economy", "drive",
"in", "\n", "preparation", "for", "Hate", "Week", ".", "The", "flat",
"was", "seven", "flights", "up", ",", "and", "Winston", ",", "who",
"\n", "was", "thirty", "-", "nine", "and", "had", "a", "varicose",
"ulcer", "above", "his", "right", "ankle", ",", "went", "slowly", ",",
"\n", "resting", "several", "times", "on", "the", "way", ".", "On",
"each", "landing", ",", "opposite", "the", "lift", "-", "shaft", ",",
"\n", "the", "poster", "with", "the", "enormous", "face", "gazed",
"from", "the", "wall", ".", "It", "was", "one", "of", "those", "\n",
"pictures", "which", "are", "so", "contrived", "that", "the", "eyes",
"follow", "you", "about", "when", "you", "move", ".", "\n", "BIG",
"BROTHER", "IS", "WATCHING", "YOU", ",", "the", "caption", "beneath",
"it", "ran", ".", "\n", ]
# fmt: on


@pytest.fixture
def heads():
# fmt: off
return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
-1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
-1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
-1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
-2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
-1, 0, -1, -1]
return [
1, 2, 2, 6, 6, 6, 2, 6, 7, 2, 2, 12, 14, 14, 2, 14, 14, 16, 19, 23, 23,
22, 23, 23, 23, 26, 24, 23, 29, 27, 31, 29, 35, 32, 35, 31, 23, 23, 37,
37, 42, 42, 39, 42, 45, 43, 37, 46, 37, 50, 51, 37, 53, 51, 55, 53, 55,
58, 56, 53, 59, 60, 60, 62, 63, 23, 65, 68, 69, 69, 69, 72, 70, 72, 76,
76, 72, 69, 96, 80, 78, 80, 81, 86, 83, 86, 96, 96, 89, 96, 89, 92, 90,
96, 96, 96, 96, 96, 99, 97, 96, 100, 103, 103, 103, 107, 107, 103, 107,
111, 111, 112, 113, 107, 103, 116, 136, 116, 120, 118, 117, 120, 125,
125, 125, 121, 116, 116, 131, 131, 131, 127, 131, 134, 131, 134, 136,
136, 139, 139, 139, 142, 140, 139, 145, 145, 147, 145, 147, 150, 148,
145, 153, 162, 153, 156, 162, 156, 157, 162, 162, 162, 162, 162, 162,
172, 165, 169, 169, 172, 169, 172, 162, 172, 172, 176, 174, 172, 179,
179, 179, 180, 183, 181, 179, 184, 185, 185, 187, 190, 188, 179, 193,
194, 194, 196, 194, 196, 194, 194, 218, 200, 204, 202, 200, 207, 207,
204, 204, 204, 212, 212, 209, 212, 216, 216, 213, 200, 194, 218, 218,
220, 218, 224, 222, 222, 227, 225, 218, 246, 231, 229, 246, 246, 237,
237, 237, 233, 246, 238, 241, 246, 241, 245, 245, 242, 246, 246, 249,
247, 246, 252, 252, 252, 253, 257, 255, 254, 259, 257, 261, 259, 265,
264, 265, 261, 265, 265, 270, 270, 267, 252, 271, 274, 275, 275, 276,
283, 283, 280, 283, 280, 281, 283, 283, 284]
# fmt: on


def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
def test_parser_parse_navigate_consistency(en_vocab, words, heads):
doc = Doc(en_vocab, words=words, heads=heads)
for head in doc:
for child in head.lefts:
assert child.head == head

@ -61,15 +77,8 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
assert child.head == head


def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=heads,
deps=["dep"] * len(heads),
)

def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):
doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
lefts = {}
rights = {}
for head in doc:

@ -99,9 +108,8 @@ def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
assert not children


def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
def test_parser_parse_navigate_edges(en_vocab, words, heads):
doc = Doc(en_vocab, words=words, heads=heads)
for token in doc:
subtree = list(token.subtree)
debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
@ -1,42 +1,40 @@
import pytest
from spacy.tokens import Doc

from spacy.tokens.doc import Doc

from ..util import get_doc, apply_transition_sequence
from ..util import apply_transition_sequence


def test_parser_space_attachment(en_tokenizer):
text = "This is a test.\nTo ensure spaces are attached well."
heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2]
def test_parser_space_attachment(en_vocab):
# fmt: off
words = ["This", "is", "a", "test", ".", "\n", "To", "ensure", " ", "spaces", "are", "attached", "well", "."]
heads = [1, 1, 3, 1, 1, 4, 7, 11, 7, 11, 11, 11, 11, 11]
# fmt: on
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
for sent in doc.sents:
if len(sent) == 1:
assert not sent[-1].is_space


def test_parser_sentence_space(en_tokenizer):
def test_parser_sentence_space(en_vocab):
# fmt: off
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
words = ["I", "look", "forward", "to", "using", "Thingamajig", ".", " ", "I", "'ve", "been", "told", "it", "will", "make", "my", "life", "easier", "..."]
heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11]
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
"poss", "nsubj", "ccomp", "punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert len(list(doc.sents)) == 2


@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
def test_parser_space_attachment_leading(en_tokenizer, en_parser):
text = "\t \n This is a sentence ."
heads = [1, 1, 0, 1, -2, -3]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
def test_parser_space_attachment_leading(en_vocab, en_parser):
words = ["\t", "\n", "This", "is", "a", "sentence", "."]
heads = [1, 2, 2, 4, 2, 2]
doc = Doc(en_vocab, words=words, heads=heads)
assert doc[0].is_space
assert doc[1].is_space
assert doc[2].text == "This"

@ -50,18 +48,16 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
text = "This is \t a \t\n \n sentence . \n\n \n"
heads = [1, 0, -1, 2, -1, -4, -5, -1]
def test_parser_space_attachment_intermediate_trailing(en_vocab, en_parser):
words = ["This", "is", "\t", "a", "\t\n", "\n", "sentence", ".", "\n\n", "\n"]
heads = [1, 1, 1, 5, 3, 1, 1, 6]
transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
doc = Doc(en_vocab, words=words, heads=heads)
assert doc[2].is_space
assert doc[4].is_space
assert doc[5].is_space
assert doc[8].is_space
assert doc[9].is_space

apply_transition_sequence(en_parser, doc, transition)
for token in doc:
assert token.dep != 0 or token.is_space

@ -72,7 +68,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
def test_parser_space_attachment_space(en_parser, text, length):
doc = Doc(en_parser.vocab, words=text)
assert len(doc) == length
with en_parser.step_through(doc) as _: # noqa: F841
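One reason the space-attachment tests above move from tokenized text to explicit word lists: whitespace tokens such as "\n" are awkward to produce through a tokenizer fixture, but the Doc constructor accepts them directly. A minimal sketch:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

# Whitespace strings are legal words; Token.is_space flags them.
words = ["This", "is", "\n", "fine", "."]
doc = Doc(Vocab(), words=words)
assert doc[2].is_space
assert not doc[0].is_space
```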
@ -4,8 +4,9 @@ from spacy.training import Example
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler
from spacy import util, registry
from spacy.tokens import Doc

from ..util import get_doc, make_tempdir
from ..util import make_tempdir


@pytest.fixture

@ -66,7 +67,6 @@ def test_attributeruler_init(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
for p in pattern_dicts:
a.add(**p)

doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur"

@ -129,7 +129,7 @@ def test_attributeruler_rule_order(nlp):
{"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}},
]
a.add_patterns(patterns)
doc = get_doc(
doc = Doc(
nlp.vocab,
words=["This", "is", "a", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],

@ -141,13 +141,12 @@ def test_attributeruler_rule_order(nlp):
def test_attributeruler_tag_map(nlp, tag_map):
a = AttributeRuler(nlp.vocab)
a.load_from_tag_map(tag_map)
doc = get_doc(
doc = Doc(
nlp.vocab,
words=["This", "is", "a", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = a(doc)

for i in range(len(doc)):
if i == 4:
assert doc[i].pos_ == "PUNCT"

@ -160,13 +159,12 @@ def test_attributeruler_tag_map(nlp, tag_map):
def test_attributeruler_morph_rules(nlp, morph_rules):
a = AttributeRuler(nlp.vocab)
a.load_from_morph_rules(morph_rules)
doc = get_doc(
doc = Doc(
nlp.vocab,
words=["This", "is", "the", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = a(doc)

for i in range(len(doc)):
if i != 2:
assert doc[i].pos_ == ""

@ -193,7 +191,6 @@ def test_attributeruler_indices(nlp):

text = "This is a test."
doc = nlp(text)

for i in range(len(doc)):
if i == 1:
assert doc[i].lemma_ == "was"

@ -205,12 +202,10 @@ def test_attributeruler_indices(nlp):
assert doc[i].lemma_ == "cat"
else:
assert doc[i].morph_ == ""

# raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
with pytest.raises(ValueError):
doc = nlp(text)

# raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10)
with pytest.raises(ValueError):

@ -220,7 +215,6 @@ def test_attributeruler_indices(nlp):
def test_attributeruler_patterns_prop(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
a.add_patterns(pattern_dicts)

for p1, p2 in zip(pattern_dicts, a.patterns):
assert p1["patterns"] == p2["patterns"]
assert p1["attrs"] == p2["attrs"]

@ -231,18 +225,15 @@ def test_attributeruler_patterns_prop(nlp, pattern_dicts):
def test_attributeruler_serialize(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
a.add_patterns(pattern_dicts)

text = "This is a test."
attrs = ["ORTH", "LEMMA", "MORPH"]
doc = nlp(text)

# bytes roundtrip
a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
assert a.to_bytes() == a_reloaded.to_bytes()
doc1 = a_reloaded(nlp.make_doc(text))
numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
assert a.patterns == a_reloaded.patterns

# disk roundtrip
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
@ -1,57 +1,38 @@
import pytest
from spacy.pipeline.functions import merge_subtokens
from spacy.language import Language
from spacy.tokens import Span

from ..util import get_doc
from spacy.tokens import Span, Doc


@pytest.fixture
def doc(en_tokenizer):
def doc(en_vocab):
# fmt: off
text = "This is a sentence. This is another sentence. And a third."
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."]
heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 11, 12, 13, 13]
deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
"subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
# fmt: on
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
return Doc(en_vocab, words=words, heads=heads, deps=deps)


@pytest.fixture
def doc2(en_tokenizer):
text = "I like New York in Autumn."
heads = [1, 0, 1, -2, -3, -1, -5]
def doc2(en_vocab):
words = ["I", "like", "New", "York", "in", "Autumn", "."]
heads = [1, 1, 3, 1, 1, 4, 1]
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=heads,
tags=tags,
pos=pos,
deps=deps,
)
doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
doc = Doc(en_vocab, words=words, heads=heads, tags=tags, pos=pos, deps=deps)
doc.ents = [Span(doc, 2, 4, label="GPE")]
return doc


def test_merge_subtokens(doc):
doc = merge_subtokens(doc)
# get_doc() doesn't set spaces, so the result is "And a third ."
assert [t.text for t in doc] == [
"This",
"is",
"a sentence",
".",
"This",
"is",
"another sentence",
".",
"And a third .",
]
# Doc doesn't have spaces, so the result is "And a third ."
# fmt: off
assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
# fmt: on


def test_factories_merge_noun_chunks(doc2):
@ -9,7 +9,7 @@ from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.tokens import Doc, Span

from ..util import get_doc, make_tempdir
from ..util import make_tempdir


@pytest.mark.parametrize(

@ -88,12 +88,9 @@ def test_issue242(en_tokenizer):
doc.ents += tuple(matches)


def test_issue309(en_tokenizer):
def test_issue309(en_vocab):
"""Test Issue #309: SBD fails on empty string"""
tokens = en_tokenizer(" ")
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
)
doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"])
assert len(doc) == 1
sents = list(doc.sents)
assert len(sents) == 1
@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher

from ..util import make_tempdir, get_doc
from ..util import make_tempdir


def test_issue1506():

@ -197,32 +197,21 @@ def test_issue1807():
def test_issue1834():
"""Test that sentence boundaries & parse/tag flags are not lost
during serialization."""
string = "This is a first sentence . And another one"
words = string.split()
doc = get_doc(Vocab(), words=words)
words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
doc = Doc(Vocab(), words=words)
doc[6].is_sent_start = True
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
assert new_doc[6].sent_start
assert not new_doc.has_annotation("DEP")
assert not new_doc.has_annotation("TAG")
doc = get_doc(
doc = Doc(
Vocab(),
words=words,
tags=["TAG"] * len(words),
heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
deps=["dep"] * len(words),
)
print(
doc.has_annotation("DEP"),
[t.head.i for t in doc],
[t.is_sent_start for t in doc],
)
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
print(
new_doc.has_annotation("DEP"),
[t.head.i for t in new_doc],
[t.is_sent_start for t in new_doc],
)
assert new_doc[6].sent_start
assert new_doc.has_annotation("DEP")
assert new_doc.has_annotation("TAG")
@ -7,7 +7,7 @@ from spacy.training import iob_to_biluo
from spacy.lang.it import Italian
from spacy.lang.en import English

from ..util import add_vecs_to_vocab, get_doc
from ..util import add_vecs_to_vocab


@pytest.mark.skip(

@ -69,9 +69,10 @@ def test_issue2219(en_vocab):
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])


def test_issue2361(de_tokenizer):
def test_issue2361(de_vocab):
chars = ("&lt;", "&gt;", "&amp;", "&quot;")
doc = de_tokenizer('< > & " ')
words = ["<", ">", "&", '"']
doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
html = render(doc)
for char in chars:
assert char in html

@ -105,7 +106,7 @@ def test_issue2385_biluo(tags):

def test_issue2396(en_vocab):
words = ["She", "created", "a", "test", "for", "spacy"]
heads = [1, 0, 1, -2, -1, -1]
heads = [1, 1, 3, 1, 3, 4]
deps = ["dep"] * len(heads)
matrix = numpy.array(
[

@ -118,7 +119,7 @@ def test_issue2396(en_vocab):
],
dtype=numpy.int32,
)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
span = doc[:]
assert (doc.get_lca_matrix() == matrix).all()
assert (span.get_lca_matrix() == matrix).all()
@ -12,8 +12,6 @@ from spacy.compat import pickle
import numpy
import random

from ..util import get_doc


def test_issue2564():
"""Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""

@ -117,12 +115,14 @@ def test_issue2754(en_tokenizer):

def test_issue2772(en_vocab):
"""Test that deprojectivization doesn't mess up sentence boundaries."""
words = "When we write or communicate virtually , we can hide our true feelings .".split()
# fmt: off
words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."]
# fmt: on
# A tree with a non-projective (i.e. crossing) arc
# The arcs (0, 4) and (2, 9) cross.
heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9]
deps = ["dep"] * len(heads)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert doc[1].is_sent_start is False
@ -10,10 +10,8 @@ from spacy.vocab import Vocab
|
|||
from spacy.attrs import ENT_IOB, ENT_TYPE
|
||||
from spacy.compat import pickle
|
||||
from spacy import displacy
|
||||
import numpy
|
||||
|
||||
from spacy.vectors import Vectors
|
||||
from ..util import get_doc
|
||||
import numpy
|
||||
|
||||
|
||||
def test_issue3002():
|
||||
|
@@ -47,7 +45,7 @@ def test_issue3009(en_vocab):
     words = ["also", "has", "to", "do", "with"]
     tags = ["RB", "VBZ", "TO", "VB", "IN"]
     pos = ["ADV", "VERB", "ADP", "VERB", "ADP"]
-    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos)
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos)
     matcher = Matcher(en_vocab)
     for i, pattern in enumerate(patterns):
         matcher.add(str(i), [pattern])
@@ -61,19 +59,15 @@ def test_issue3012(en_vocab):
     words = ["This", "is", "10", "%", "."]
     tags = ["DT", "VBZ", "CD", "NN", "."]
     pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [(2, 4, "PERCENT")]
-    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
+    ents = [("PERCENT", 2, 4)]
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
     assert doc.has_annotation("TAG")
-
     expected = ("10", "NUM", "CD", "PERCENT")
     assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
-
     header = [ENT_IOB, ENT_TYPE]
     ent_array = doc.to_array(header)
     doc.from_array(header, ent_array)
-
     assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
-
     # Serializing then deserializing
     doc_bytes = doc.to_bytes()
     doc2 = Doc(en_vocab).from_bytes(doc_bytes)
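Editor's aside (not part of the diff): the hunks above switch entity annotations from `(start, end, label)` offset tuples to the `(label, start, end)` order expected by the new `Doc` constructor. A minimal sketch of converting old-style tuples, using the values from this very hunk:

```python
# Illustration only: reorder old-style entity tuples into the
# (label, start, end) order used by the new Doc constructor.
old_ents = [(2, 4, "PERCENT")]
new_ents = [(label, start, end) for start, end, label in old_ents]
assert new_ents == [("PERCENT", 2, 4)]
```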
@@ -85,12 +79,8 @@ def test_issue3199():
     is available. To make this test future-proof, we're constructing a Doc
     with a new Vocab here and a parse tree to make sure the noun chunks run.
     """
-    doc = get_doc(
-        Vocab(),
-        words=["This", "is", "a", "sentence"],
-        heads=[0, -1, -2, -3],
-        deps=["dep"] * 4,
-    )
+    words = ["This", "is", "a", "sentence"]
+    doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
     assert list(doc[0:3].noun_chunks) == []
@@ -147,9 +137,9 @@ def test_issue3288(en_vocab):
     """Test that retokenization works correctly via displaCy when punctuation
     is merged onto the preceding token and tensor is resized."""
     words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
-    heads = [1, 0, -1, 1, 0, 1, -2, -3]
+    heads = [1, 1, 1, 4, 4, 6, 4, 4]
     deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
     displacy.render(doc)
@@ -20,7 +20,7 @@ import spacy
 import srsly
 import numpy

-from ..util import make_tempdir, get_doc
+from ..util import make_tempdir


 @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
@@ -355,7 +355,7 @@ def test_issue3882(en_vocab):
     """Test that displaCy doesn't serialize the doc.user_data when making a
     copy of the Doc.
     """
-    doc = Doc(en_vocab, words=["Hello", "world"])
+    doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"])
     doc.user_data["test"] = set()
     parse_deps(doc)
@@ -398,10 +398,10 @@ def test_issue3962(en_vocab):
     This is achieved by setting the head to itself if it would lie out of the span otherwise."""
     # fmt: off
     words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
-    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+    heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
     deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
     # fmt: on
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     span2 = doc[1:5]  # "jests at scars ,"
     doc2 = span2.as_doc()
     doc2_json = doc2.to_json()
@@ -436,10 +436,10 @@ def test_issue3962_long(en_vocab):
     This is achieved by setting the head to itself if it would lie out of the span otherwise."""
     # fmt: off
     words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
-    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+    heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
    deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
     # fmt: on
-    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     span2 = two_sent_doc[1:7]  # "jests at scars. They never"
     doc2 = span2.as_doc()
     doc2_json = doc2.to_json()

spacy/tests/regression/test_issue5001-5500.py (new file, 138 additions)
@@ -0,0 +1,138 @@
+import numpy
+from spacy.tokens import Doc, DocBin
+from spacy.attrs import DEP, POS, TAG
+from spacy.lang.en import English
+from spacy.language import Language
+from spacy.lang.en.syntax_iterators import noun_chunks
+from spacy.vocab import Vocab
+import spacy
+import pytest
+
+from ...util import make_tempdir
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+    strings = en_vocab.strings
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+    doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
+
+
+def test_issue5082():
+    # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
+    nlp = English()
+    vocab = nlp.vocab
+    array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32)
+    array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32)
+    array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32)
+    array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32)
+    array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32)
+    vocab.set_vector("I", array1)
+    vocab.set_vector("like", array2)
+    vocab.set_vector("David", array3)
+    vocab.set_vector("Bowie", array4)
+    text = "I like David Bowie"
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    parsed_vectors_1 = [t.vector for t in nlp(text)]
+    assert len(parsed_vectors_1) == 4
+    numpy.testing.assert_array_equal(parsed_vectors_1[0], array1)
+    numpy.testing.assert_array_equal(parsed_vectors_1[1], array2)
+    numpy.testing.assert_array_equal(parsed_vectors_1[2], array3)
+    numpy.testing.assert_array_equal(parsed_vectors_1[3], array4)
+    nlp.add_pipe("merge_entities")
+    parsed_vectors_2 = [t.vector for t in nlp(text)]
+    assert len(parsed_vectors_2) == 3
+    numpy.testing.assert_array_equal(parsed_vectors_2[0], array1)
+    numpy.testing.assert_array_equal(parsed_vectors_2[1], array2)
+    numpy.testing.assert_array_equal(parsed_vectors_2[2], array34)
+
+
+def test_issue5137():
+    @Language.factory("my_component")
+    class MyComponent:
+        def __init__(self, nlp, name="my_component", categories="all_categories"):
+            self.nlp = nlp
+            self.categories = categories
+            self.name = name
+
+        def __call__(self, doc):
+            pass
+
+        def to_disk(self, path, **kwargs):
+            pass
+
+        def from_disk(self, path, **cfg):
+            pass
+
+    nlp = English()
+    my_component = nlp.add_pipe("my_component")
+    assert my_component.categories == "all_categories"
+    with make_tempdir() as tmpdir:
+        nlp.to_disk(tmpdir)
+        overrides = {"components": {"my_component": {"categories": "my_categories"}}}
+        nlp2 = spacy.load(tmpdir, config=overrides)
+        assert nlp2.get_pipe("my_component").categories == "my_categories"
+
+
+def test_issue5141(en_vocab):
+    """ Ensure an empty DocBin does not crash on serialization """
+    doc_bin = DocBin(attrs=["DEP", "HEAD"])
+    assert list(doc_bin.get_docs(en_vocab)) == []
+    doc_bin_bytes = doc_bin.to_bytes()
+    doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
+    assert list(doc_bin_2.get_docs(en_vocab)) == []
+
+
+def test_issue5152():
+    # Test that the comparison between a Span and a Token, goes well
+    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
+    nlp = English()
+    text = nlp("Talk about being boring!")
+    text_var = nlp("Talk of being boring!")
+    y = nlp("Let")
+    span = text[0:3]  # Talk about being
+    span_2 = text[0:3]  # Talk about being
+    span_3 = text_var[0:3]  # Talk of being
+    token = y[0]  # Let
+    with pytest.warns(UserWarning):
+        assert span.similarity(token) == 0.0
+    assert span.similarity(span_2) == 1.0
+    with pytest.warns(UserWarning):
+        assert span_2.similarity(span_3) < 1.0
+
+
+def test_issue5458():
+    # Test that the noun chunker does not generate overlapping spans
+    # fmt: off
+    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+    vocab = Vocab(strings=words)
+    deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+    pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
+    heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
+    # fmt: on
+    en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
+    en_doc.noun_chunks_iterator = noun_chunks
+
+    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
+    nlp = English()
+    merge_nps = nlp.create_pipe("merge_noun_chunks")
+    merge_nps(en_doc)
@@ -1,32 +0,0 @@
-import numpy
-from spacy.tokens import Doc
-from spacy.attrs import DEP, POS, TAG
-
-from ..util import get_doc
-
-
-def test_issue5048(en_vocab):
-    words = ["This", "is", "a", "sentence"]
-    pos_s = ["DET", "VERB", "DET", "NOUN"]
-    spaces = [" ", " ", " ", ""]
-    deps_s = ["dep", "adj", "nn", "atm"]
-    tags_s = ["DT", "VBZ", "DT", "NN"]
-
-    strings = en_vocab.strings
-
-    for w in words:
-        strings.add(w)
-    deps = [strings.add(d) for d in deps_s]
-    pos = [strings.add(p) for p in pos_s]
-    tags = [strings.add(t) for t in tags_s]
-
-    attrs = [POS, DEP, TAG]
-    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
-
-    doc = Doc(en_vocab, words=words, spaces=spaces)
-    doc.from_array(attrs, array)
-    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
-
-    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
-    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
-    assert v1 == v2
@@ -1,37 +0,0 @@
-import numpy as np
-from spacy.lang.en import English
-
-
-def test_issue5082():
-    # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
-    nlp = English()
-    vocab = nlp.vocab
-    array1 = np.asarray([0.1, 0.5, 0.8], dtype=np.float32)
-    array2 = np.asarray([-0.2, -0.6, -0.9], dtype=np.float32)
-    array3 = np.asarray([0.3, -0.1, 0.7], dtype=np.float32)
-    array4 = np.asarray([0.5, 0, 0.3], dtype=np.float32)
-    array34 = np.asarray([0.4, -0.05, 0.5], dtype=np.float32)
-
-    vocab.set_vector("I", array1)
-    vocab.set_vector("like", array2)
-    vocab.set_vector("David", array3)
-    vocab.set_vector("Bowie", array4)
-
-    text = "I like David Bowie"
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]}
-    ]
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
-    parsed_vectors_1 = [t.vector for t in nlp(text)]
-    assert len(parsed_vectors_1) == 4
-    np.testing.assert_array_equal(parsed_vectors_1[0], array1)
-    np.testing.assert_array_equal(parsed_vectors_1[1], array2)
-    np.testing.assert_array_equal(parsed_vectors_1[2], array3)
-    np.testing.assert_array_equal(parsed_vectors_1[3], array4)
-    nlp.add_pipe("merge_entities")
-    parsed_vectors_2 = [t.vector for t in nlp(text)]
-    assert len(parsed_vectors_2) == 3
-    np.testing.assert_array_equal(parsed_vectors_2[0], array1)
-    np.testing.assert_array_equal(parsed_vectors_2[1], array2)
-    np.testing.assert_array_equal(parsed_vectors_2[2], array34)
@@ -1,32 +0,0 @@
-import spacy
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.tests.util import make_tempdir
-
-
-def test_issue5137():
-    @Language.factory("my_component")
-    class MyComponent:
-        def __init__(self, nlp, name="my_component", categories="all_categories"):
-            self.nlp = nlp
-            self.categories = categories
-            self.name = name
-
-        def __call__(self, doc):
-            pass
-
-        def to_disk(self, path, **kwargs):
-            pass
-
-        def from_disk(self, path, **cfg):
-            pass
-
-    nlp = English()
-    my_component = nlp.add_pipe("my_component")
-    assert my_component.categories == "all_categories"
-
-    with make_tempdir() as tmpdir:
-        nlp.to_disk(tmpdir)
-        overrides = {"components": {"my_component": {"categories": "my_categories"}}}
-        nlp2 = spacy.load(tmpdir, config=overrides)
-        assert nlp2.get_pipe("my_component").categories == "my_categories"
@@ -1,11 +0,0 @@
-from spacy.tokens import DocBin
-
-
-def test_issue5141(en_vocab):
-    """ Ensure an empty DocBin does not crash on serialization """
-    doc_bin = DocBin(attrs=["DEP", "HEAD"])
-    assert list(doc_bin.get_docs(en_vocab)) == []
-    doc_bin_bytes = doc_bin.to_bytes()
-
-    doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
-    assert list(doc_bin_2.get_docs(en_vocab)) == []
@@ -1,20 +0,0 @@
-from spacy.lang.en import English
-import pytest
-
-
-def test_issue5152():
-    # Test that the comparison between a Span and a Token, goes well
-    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
-    nlp = English()
-    text = nlp("Talk about being boring!")
-    text_var = nlp("Talk of being boring!")
-    y = nlp("Let")
-    span = text[0:3]  # Talk about being
-    span_2 = text[0:3]  # Talk about being
-    span_3 = text_var[0:3]  # Talk of being
-    token = y[0]  # Let
-    with pytest.warns(UserWarning):
-        assert span.similarity(token) == 0.0
-    assert span.similarity(span_2) == 1.0
-    with pytest.warns(UserWarning):
-        assert span_2.similarity(span_3) < 1.0
@@ -1,23 +0,0 @@
-from spacy.lang.en import English
-from spacy.lang.en.syntax_iterators import noun_chunks
-from spacy.tests.util import get_doc
-from spacy.vocab import Vocab
-
-
-def test_issue5458():
-    # Test that the noun chuncker does not generate overlapping spans
-    # fmt: off
-    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
-    vocab = Vocab(strings=words)
-    dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
-    pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
-    heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10]
-    # fmt: on
-
-    en_doc = get_doc(vocab, words, pos_tags, heads, dependencies)
-    en_doc.noun_chunks_iterator = noun_chunks
-
-    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
-    nlp = English()
-    merge_nps = nlp.create_pipe("merge_noun_chunks")
-    merge_nps(en_doc)
@@ -1,5 +1,6 @@
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities
+import pytest


 def test_issue5918():

@@ -22,6 +23,7 @@ def test_issue5918():
     assert len(doc.ents) == 3
     # make it so that the third span's head is within the entity (ent_iob=I)
     # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
-    doc[29].head = doc[33]
+    with pytest.warns(UserWarning):
+        doc[29].head = doc[33]
     doc = merge_entities(doc)
     assert len(doc.ents) == 3
@@ -1,15 +1,16 @@
 import pytest
 from click import NoSuchOption

 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list
-from thinc.config import ConfigValidationError
+from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
+from spacy.cli.debug_config import check_section_refs
+from thinc.config import ConfigValidationError, Config
 import srsly
+import os

 from .util import make_tempdir
@@ -341,6 +342,24 @@ def test_parse_config_overrides_invalid_2(args):
         parse_config_overrides(args)


+def test_parse_cli_overrides():
+    os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    result = parse_config_overrides([])
+    assert len(result) == 4
+    assert result["x.foo"] == "bar"
+    assert result["x.bar"] == 12
+    assert result["x.baz"] is False
+    assert result["y.foo"] == "hello"
+    os.environ[OVERRIDES_ENV_VAR] = "--x"
+    assert parse_config_overrides([], env_var=None) == {}
+    with pytest.raises(SystemExit):
+        parse_config_overrides([])
+    os.environ[OVERRIDES_ENV_VAR] = "hello world"
+    with pytest.raises(SystemExit):
+        parse_config_overrides([])
+    del os.environ[OVERRIDES_ENV_VAR]
+
+
 @pytest.mark.parametrize("lang", ["en", "nl"])
 @pytest.mark.parametrize(
     "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
@@ -395,3 +414,15 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_check_section_refs():
+    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
+    config = Config(config)
+    # Valid section reference
+    check_section_refs(config, ["a.b.c"])
+    # Section that doesn't exist in this config
+    check_section_refs(config, ["x.y.z"])
+    # Invalid section reference
+    with pytest.raises(ConfigValidationError):
+        check_section_refs(config, ["a.b.c", "f.g"])
@@ -1,15 +1,13 @@
 import pytest
 from spacy import displacy
 from spacy.displacy.render import DependencyRenderer, EntityRenderer
-from spacy.tokens import Span
+from spacy.tokens import Span, Doc
 from spacy.lang.fa import Persian

-from .util import get_doc
-

 def test_displacy_parse_ents(en_vocab):
     """Test that named entities on a Doc are converted into displaCy's format."""
-    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
+    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
     doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
     ents = displacy.parse_ents(doc)
     assert isinstance(ents, dict)
@@ -20,11 +18,11 @@ def test_displacy_parse_ents(en_vocab):
 def test_displacy_parse_deps(en_vocab):
     """Test that deps and tags on a Doc are converted into displaCy's format."""
     words = ["This", "is", "a", "sentence"]
-    heads = [1, 0, 1, -2]
+    heads = [1, 1, 3, 1]
     pos = ["DET", "VERB", "DET", "NOUN"]
     tags = ["DT", "VBZ", "DT", "NN"]
     deps = ["nsubj", "ROOT", "det", "attr"]
-    doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
@@ -53,7 +51,7 @@ def test_displacy_invalid_arcs():

 def test_displacy_spans(en_vocab):
     """Test that displaCy can render Spans."""
-    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
+    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
     doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
     html = displacy.render(doc[1:4], style="ent")
     assert html.startswith("<div")
@@ -70,9 +68,9 @@ def test_displacy_rtl():
     # These are (likely) wrong, but it's just for testing
     pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
     deps = ["foo", "bar", "foo", "baz"]
-    heads = [1, 0, 1, -2]
+    heads = [1, 0, 3, 1]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
+    doc = Doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html
@@ -90,7 +88,7 @@ def test_displacy_render_wrapper(en_vocab):
         return "TEST" + html + "TEST"

     displacy.set_render_wrapper(wrapper)
-    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
+    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
     doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
     html = displacy.render(doc, style="ent")
     assert html.startswith("TEST<div")
@@ -5,7 +5,6 @@ from spacy.training import Example
 from spacy.training.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
-from .util import get_doc
 from spacy.lang.en import English
 from spacy.tokens import Doc
@@ -137,11 +136,8 @@ def test_las_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_las_apple:
-        doc = get_doc(
-            en_vocab,
-            words=input_.split(" "),
-            heads=([h - i for i, h in enumerate(annot["heads"])]),
-            deps=annot["deps"],
+        doc = Doc(
+            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
         )
         gold = {"heads": annot["heads"], "deps": annot["deps"]}
         example = Example.from_dict(doc, gold)
@@ -161,11 +157,8 @@ def test_las_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_las_apple:
-        doc = get_doc(
-            en_vocab,
-            words=input_.split(" "),
-            heads=([h - i for i, h in enumerate(annot["heads"])]),
-            deps=annot["deps"],
+        doc = Doc(
+            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
         )
         gold = {"heads": annot["heads"], "deps": annot["deps"]}
         doc[0].dep_ = "compound"
@@ -188,10 +181,10 @@ def test_ner_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_ner_cardinal:
-        doc = get_doc(
+        doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
+            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
         )
         entities = biluo_tags_from_offsets(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
@@ -213,10 +206,10 @@ def test_ner_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_ner_apple:
-        doc = get_doc(
+        doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
+            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
         )
         entities = biluo_tags_from_offsets(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
@@ -12,13 +12,14 @@ from thinc.api import compounding
 import pytest
 import srsly

-from ..util import make_tempdir, get_doc
+from ..util import make_tempdir


 @pytest.fixture
-def doc():
+def doc(en_vocab):
+    nlp = English()  # make sure we get a new vocab every time
     # fmt: off
-    text = "Sarah's sister flew to Silicon Valley via London."
+    words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
     tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
     pos = ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
     morphs = ["NounType=prop|Number=sing", "Poss=yes", "Number=sing", "Tense=past|VerbForm=fin",
@@ -26,15 +27,12 @@ def doc():
               "NounType=prop|Number=sing", "PunctType=peri"]
     # head of '.' is intentionally nonprojective for testing
     heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
-    heads = [head - i for i, head in enumerate(heads)]
     deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
     lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE"))
+    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
-    nlp = English()
-    words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(
+    doc = Doc(
         nlp.vocab,
         words=words,
         tags=tags,
@@ -212,41 +210,24 @@ def test_json2docs_no_ner(en_vocab):


 def test_split_sentences(en_vocab):
+    # fmt: off
     words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
-    doc = Doc(en_vocab, words=words)
-    gold_words = [
-        "I",
-        "flew",
-        "to",
-        "San",
-        "Francisco",
-        "Valley",
-        "had",
-        "loads",
-        "of",
-        "fun",
-    ]
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"]
     sent_starts = [True, False, False, False, False, False, True, False, False, False]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
     assert example.text == "I flew to San Francisco Valley had loads of fun "
     split_examples = example.split_sents()
     assert len(split_examples) == 2
     assert split_examples[0].text == "I flew to San Francisco Valley "
     assert split_examples[1].text == "had loads of fun "

+    # fmt: off
     words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"]
-    doc = Doc(en_vocab, words=words)
-    gold_words = [
-        "I",
-        "flew",
-        "to",
-        "San Francisco",
-        "Valley",
-        "had",
-        "loads of",
-        "fun",
-    ]
+    gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"]
     sent_starts = [True, False, False, False, False, True, False, False]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
     assert example.text == "I flew to San Francisco Valley had loads of fun "
     split_examples = example.split_sents()
|
|||
heads = [t.head.i for t in doc]
|
||||
cats = doc.cats
|
||||
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
||||
|
||||
# roundtrip to DocBin
|
||||
with make_tempdir() as tmpdir:
|
||||
# use a separate vocab to test that all labels are added
|
||||
|
@@ -600,7 +580,6 @@ def test_tuple_format_implicit():

 def test_tuple_format_implicit_invalid():
     """Test that an error is thrown for an implicit invalid field"""
-
     train_data = [
         ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
         (

@@ -609,7 +588,6 @@ def test_tuple_format_implicit_invalid():
         ),
         ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
     ]
-
     with pytest.raises(KeyError):
         _train_tuples(train_data)
@@ -619,11 +597,9 @@ def _train_tuples(train_data):
     ner = nlp.add_pipe("ner")
     ner.add_label("ORG")
     ner.add_label("LOC")
-
     train_examples = []
     for t in train_data:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-
     optimizer = nlp.begin_training()
     for i in range(5):
         losses = {}
@@ -639,17 +615,14 @@ def test_split_sents(merged_dict):
         merged_dict,
     )
     assert example.text == "Hi there everyone It is just me"
-
     split_examples = example.split_sents()
     assert len(split_examples) == 2
     assert split_examples[0].text == "Hi there everyone "
     assert split_examples[1].text == "It is just me"
-
     token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
     assert token_annotation_1["ORTH"] == ["Hi", "there", "everyone"]
     assert token_annotation_1["TAG"] == ["INTJ", "ADV", "PRON"]
     assert token_annotation_1["SENT_START"] == [1, 0, 0]
-
     token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
     assert token_annotation_2["ORTH"] == ["It", "is", "just", "me"]
     assert token_annotation_2["TAG"] == ["PRON", "AUX", "ADV", "PRON"]
@@ -2,11 +2,7 @@ import numpy
 import tempfile
 import contextlib
 import srsly
-
-from spacy import Errors
-from spacy.tokens import Doc, Span
-from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH
-
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
 from spacy.util import make_tempdir  # noqa: F401
@@ -18,74 +14,6 @@ def make_tempfile(mode="r"):
     f.close()


-def get_doc(
-    vocab,
-    words=[],
-    pos=None,
-    heads=None,
-    deps=None,
-    tags=None,
-    ents=None,
-    lemmas=None,
-    morphs=None,
-):
-    """Create Doc object from given vocab, words and annotations."""
-    if deps and not heads:
-        heads = [0] * len(deps)
-    headings = []
-    values = []
-    annotations = [pos, heads, deps, lemmas, tags, morphs]
-    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
-    for a, annot in enumerate(annotations):
-        if annot is not None:
-            if len(annot) != len(words):
-                raise ValueError(Errors.E189)
-            headings.append(possible_headings[a])
-            if annot is not heads:
-                values.extend(annot)
-    for value in values:
-        vocab.strings.add(value)
-
-    doc = Doc(vocab, words=words)
-
-    # if there are any other annotations, set them
-    if headings:
-        attrs = doc.to_array(headings)
-
-        j = 0
-        for annot in annotations:
-            if annot:
-                if annot is heads:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = heads[i]
-                        else:
-                            attrs[i, j] = heads[i]
-                elif annot is morphs:
-                    for i in range(len(words)):
-                        morph_key = vocab.morphology.add(morphs[i])
-                        if attrs.ndim == 1:
-                            attrs[i] = morph_key
-                        else:
-                            attrs[i, j] = morph_key
-                else:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = doc.vocab.strings[annot[i]]
-                        else:
-                            attrs[i, j] = doc.vocab.strings[annot[i]]
-                j += 1
-        doc.from_array(headings, attrs)
-
-    # finally, set the entities
-    if ents:
-        doc.ents = [
-            Span(doc, start, end, label=doc.vocab.strings[label])
-            for start, end, label in ents
-        ]
-    return doc
-
-
 def get_batch(batch_size):
     vocab = Vocab()
     docs = []
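Editor's aside (not part of the diff): the removed `get_doc` helper took `heads` as offsets relative to each token, while the extended `Doc` constructor below takes absolute token indices. A minimal sketch of the equivalence, mirroring the `head - i` arithmetic the constructor applies internally:

```python
# Relative head offsets (old get_doc convention) vs. absolute indices
# (new Doc(..., heads=...) convention). Example values are illustrative.
relative_heads = [1, 0, -1]  # each entry: head position minus token position
absolute_heads = [i + h for i, h in enumerate(relative_heads)]
assert absolute_heads == [1, 1, 1]  # token 1 is the root; tokens 0 and 2 attach to it
```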
@@ -170,17 +170,50 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)

-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
+    def __init__(
+        self,
+        Vocab vocab,
+        words=None,
+        spaces=None,
+        *,
+        user_data=None,
+        tags=None,
+        pos=None,
+        morphs=None,
+        lemmas=None,
+        heads=None,
+        deps=None,
+        sent_starts=None,
+        ents=None,
+    ):
         """Create a Doc object.

         vocab (Vocab): A vocabulary object, which must match any models you
             want to use (e.g. tokenizer, parser, entity recognizer).
-        words (list or None): A list of unicode strings to add to the document
+        words (Optional[List[str]]): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
-        spaces (list or None): A list of boolean values, of the same length as
+        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
+        tags (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.tag. Defaults to None.
+        pos (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.pos. Defaults to None.
+        morphs (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.morph. Defaults to None.
+        lemmas (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.lemma. Defaults to None.
+        heads (Optional[List[int]]): A list of values, of the same length as
+            words, to assign as heads. Head indices are the position of the
+            head in the doc. Defaults to None.
+        deps (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.dep. Defaults to None.
+        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
+            the same length as words, to assign as token.is_sent_start. Will be
+            overridden by heads if heads is provided. Defaults to None.
+        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
+            (label, start, end) tuples to assign as doc.ents. Defaults to None.

         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -229,6 +262,63 @@ cdef class Doc:
             lexeme = self.vocab.get_by_orth(self.mem, word)
             self.push_back(lexeme, has_space)

+        if heads is not None:
+            heads = [head - i for i, head in enumerate(heads)]
+        if deps and not heads:
+            heads = [0] * len(deps)
+        if sent_starts is not None:
+            for i in range(len(sent_starts)):
+                if sent_starts[i] is True:
+                    sent_starts[i] = 1
+                elif sent_starts[i] is False:
+                    sent_starts[i] = -1
+                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
+                    sent_starts[i] = 0
+        headings = []
+        values = []
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        for a, annot in enumerate(annotations):
+            if annot is not None:
+                if len(annot) != len(words):
+                    raise ValueError(Errors.E189)
+                headings.append(possible_headings[a])
+                if annot is not heads and annot is not sent_starts:
+                    values.extend(annot)
+        for value in values:
+            self.vocab.strings.add(value)
+
+        # if there are any other annotations, set them
+        if headings:
+            attrs = self.to_array(headings)
+            j = 0
+            for annot in annotations:
+                if annot:
+                    if annot is heads or annot is sent_starts:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = annot[i]
+                            else:
+                                attrs[i, j] = annot[i]
+                    elif annot is morphs:
+                        for i in range(len(words)):
+                            morph_key = vocab.morphology.add(morphs[i])
+                            if attrs.ndim == 1:
+                                attrs[i] = morph_key
+                            else:
+                                attrs[i, j] = morph_key
+                    else:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = self.vocab.strings[annot[i]]
+                            else:
+                                attrs[i, j] = self.vocab.strings[annot[i]]
+                    j += 1
+            self.from_array(headings, attrs)
+        if ents is not None:
+            self.ents = ents
+
     @property
     def _(self):
         """Custom extension attributes registered via `set_extension`."""
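Editor's aside (not part of the diff): a minimal sketch of how the keyword arguments added above compose. The sentence, tags and labels are invented for illustration; the calling convention follows the constructor in this commit.

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["Alice", "visited", "Paris", "."]
doc = Doc(
    Vocab(),
    words=words,
    tags=["NNP", "VBD", "NNP", "."],
    pos=["PROPN", "VERB", "PROPN", "PUNCT"],
    heads=[1, 1, 1, 1],  # absolute indices: every token attaches to "visited"
    deps=["nsubj", "ROOT", "dobj", "punct"],
    ents=[("GPE", 2, 3)],  # (label, start, end) token offsets
)
assert doc.has_annotation("DEP")
assert doc.ents[0].text == "Paris"
```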
@@ -585,11 +675,14 @@ cdef class Doc:
         tokens_in_ents = {}
         cdef attr_t entity_type
         cdef attr_t kb_id
-        cdef int ent_start, ent_end
+        cdef int ent_start, ent_end, token_index
         for ent_info in ents:
-            entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
+            entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
+            if isinstance(entity_type_, str):
+                self.vocab.strings.add(entity_type_)
+            entity_type = self.vocab.strings.as_int(entity_type_)
             for token_index in range(ent_start, ent_end):
-                if token_index in tokens_in_ents.keys():
+                if token_index in tokens_in_ents:
                     raise ValueError(Errors.E103.format(
                         span1=(tokens_in_ents[token_index][0],
                             tokens_in_ents[token_index][1],
@@ -199,13 +199,17 @@ def doc_from_conllu_sentence(
         heads.append(head)
         deps.append(dep)

-    doc = Doc(vocab, words=words, spaces=spaces)
+    doc = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        pos=poses,
+        deps=deps,
+        lemmas=lemmas,
+        heads=heads,
+    )
     for i in range(len(doc)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = poses[i]
-        doc[i].dep_ = deps[i]
-        doc[i].lemma_ = lemmas[i]
-        doc[i].head = doc[heads[i]]
         doc[i]._.merged_orth = words[i]
         doc[i]._.merged_morph = morphs[i]
         doc[i]._.merged_lemma = lemmas[i]
@@ -232,14 +236,17 @@ def doc_from_conllu_sentence(
         heads.append(t.head.i)
         deps.append(t.dep_)

-    doc_x = Doc(vocab, words=words, spaces=spaces)
-    for i in range(len(doc)):
-        doc_x[i].tag_ = tags[i]
-        doc_x[i].morph_ = morphs[i]
-        doc_x[i].lemma_ = lemmas[i]
-        doc_x[i].pos_ = poses[i]
-        doc_x[i].dep_ = deps[i]
-        doc_x[i].head = doc_x[heads[i]]
+    doc_x = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        morphs=morphs,
+        lemmas=lemmas,
+        pos=poses,
+        deps=deps,
+        heads=heads,
+    )
     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]

     return doc_x
@@ -221,7 +221,7 @@ cdef class Example:
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
-        if not self.reference.is_sentenced:
+        if not self.reference.has_annotation("SENT_START"):
             return [self]

         align = self.alignment.y2x
@@ -25,16 +25,27 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 >
 > # Construction 2
 > from spacy.tokens import Doc
 >
 > words = ["hello", "world", "!"]
 > spaces = [True, False, False]
 > doc = Doc(nlp.vocab, words=words, spaces=spaces)
 > ```

 | Name | Description |
 | ---- | ----------- |
 | `vocab` | A storage container for lexical types. ~~Vocab~~ |
 | `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
 | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
+| _keyword-only_ | |
+| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
+| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `lemmas` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `heads` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
+| `deps` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
+| `ents` <Tag variant="new">3</Tag> | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |

 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
@@ -187,8 +198,8 @@ Remove a previously registered extension.
 ## Doc.char_span {#char_span tag="method" new="2"}

 Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
-`None` if the character indices don't map to a valid span using the default mode
-`"strict"`.
+`None` if the character indices don't map to a valid span using the default
+alignment mode `"strict"`.

 > #### Example
 >
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"inside"` (span of all tokens completely within the character span), `"outside"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Doc.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
@@ -271,6 +282,19 @@ ancestor is found, e.g. if span excludes a necessary ancestor.

 Check whether the doc contains annotation on a token attribute.

+<Infobox title="Changed in v3.0" variant="warning">
+
+This method replaces the previous boolean attributes like `Doc.is_tagged`,
+`Doc.is_parsed` or `Doc.is_sentenced`.
+
+```diff
+doc = nlp("This is a text")
+- assert doc.is_parsed
++ assert doc.has_annotation("DEP")
+```
+
+</Infobox>
+
 | Name | Description |
 | ------ | ----------- |
 | `attr` | The attribute string name or int ID. ~~Union[int, str]~~ |
@@ -187,7 +187,7 @@ more efficient than processing texts one-by-one.
 > ```python
 > texts = ["One document.", "...", "Lots of documents"]
 > for doc in nlp.pipe(texts, batch_size=50):
->     assert doc.is_parsed
+>     assert doc.has_annotation("DEP")
 > ```

 | Name | Description |
@@ -65,22 +65,22 @@ Matchers help you find and extract information from [`Doc`](/api/doc) objects
 based on match patterns describing the sequences you're looking for. A matcher
 operates on a `Doc` and gives you access to the matched tokens **in context**.

 | Name | Description |
 | ---- | ----------- |
 | [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
 | [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
-| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
+| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using [Semgrex operators](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |

 ### Other classes {#architecture-other}

 | Name | Description |
 | ---- | ----------- |
 | [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
 | [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
 | [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
 | [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
-| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
+| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |
 | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
 | [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
 | [`Scorer`](/api/scorer) | Compute evaluation scores. |
 | [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
@@ -205,9 +205,10 @@ acquired from [WordNet](https://wordnet.princeton.edu/).
 spaCy features a fast and accurate syntactic dependency parser, and has a rich
 API for navigating the tree. The parser also powers the sentence boundary
 detection, and lets you iterate over base noun phrases, or "chunks". You can
-check whether a [`Doc`](/api/doc) object has been parsed with the
-`doc.is_parsed` attribute, which returns a boolean value. If this attribute is
-`False`, the default sentence iterator will raise an exception.
+check whether a [`Doc`](/api/doc) object has been parsed by calling
+`doc.has_annotation("DEP")`, which checks whether the attribute `Token.dep` has
+been set and returns a boolean value. If the result is `False`, the default
+sentence iterator will raise an exception.

 <Infobox title="Dependency label scheme" emoji="📖">
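Editor's aside (not part of the diff): a small sketch of the guard described in the hunk above. `spacy.blank` gives a pipeline without a parser, so the check is what keeps the default sentence iterator from raising.

```python
import spacy

nlp = spacy.blank("en")  # no dependency parser in a blank pipeline
doc = nlp("This is a sentence. This is another one.")
if doc.has_annotation("DEP"):
    sentences = list(doc.sents)  # safe: a dependency parse is present
else:
    sentences = []  # iterating doc.sents here would raise an exception
```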
@@ -1705,9 +1706,10 @@ and can still be overwritten by the parser.
 <Infobox title="Important note" variant="warning">

 To prevent inconsistent state, you can only set boundaries **before** a document
-is parsed (and `doc.is_parsed` is `False`). To ensure that your component is
-added in the right place, you can set `before='parser'` or `first=True` when
-adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
+is parsed (and `doc.has_annotation("DEP")` is `False`). To ensure that your
+component is added in the right place, you can set `before='parser'` or
+`first=True` when adding it to the pipeline using
+[`nlp.add_pipe`](/api/language#add_pipe).

 </Infobox>
@@ -299,9 +299,10 @@ installed in the same environment – that's it.

 When you load a pipeline, spaCy will generally use its `config.cfg` to set up
 the language class and construct the pipeline. The pipeline is specified as a
-list of strings, e.g. `pipeline = ["tagger", "paser", "ner"]`. For each of those
-strings, spaCy will call `nlp.add_pipe` and look up the name in all factories
-defined by the decorators [`@Language.component`](/api/language#component) and
+list of strings, e.g. `pipeline = ["tagger", "parser", "ner"]`. For each of
+those strings, spaCy will call `nlp.add_pipe` and look up the name in all
+factories defined by the decorators
+[`@Language.component`](/api/language#component) and
 [`@Language.factory`](/api/language#factory). This means that you have to import
 your custom components _before_ loading the pipeline.
@@ -214,6 +214,24 @@ overrides. Overrides are added before [variables](#config-interpolation) are
 resolved, by the way – so if you need to use a value in multiple places,
 reference it across your config and override it on the CLI once.

+> #### 💡 Tip: Verbose logging
+>
+> If you're using config overrides, you can set the `--verbose` flag on
+> [`spacy train`](/api/cli#train) to make spaCy log more info, including which
+> overrides were set via the CLI and environment variables.
+
+#### Adding overrides via environment variables {#config-overrides-env}
+
+Instead of defining the overrides as CLI arguments, you can also use the
+`SPACY_CONFIG_OVERRIDES` environment variable using the same argument syntax.
+This is especially useful if you're training models as part of an automated
+process. Environment variables **take precedence** over CLI overrides and
+values defined in the config file.
+
+```cli
+$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
+```
+
 ### Defining pipeline components {#config-components}

 You typically train a [pipeline](/usage/processing-pipelines) of **one or more
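Editor's aside (not part of the diff): a simplified sketch of the precedence rule documented above. The real implementation is `spacy.cli._util.parse_config_overrides`, which uses `click`'s argument splitting and type coercion; this hypothetical standalone version skips both, so quoting and non-string values are not handled.

```python
import os
from typing import Any, Dict, List


def read_overrides(cli_args: List[str], env_var: str = "SPACY_CONFIG_OVERRIDES") -> Dict[str, Any]:
    # Env var values are parsed after the CLI args, so they take precedence
    # when the same dotted key appears in both sources.
    args = cli_args + os.environ.get(env_var, "").split()
    result: Dict[str, Any] = {}
    it = iter(args)
    for arg in it:
        if arg.startswith("--") and "." in arg:
            if "=" in arg:
                key, value = arg[2:].split("=", 1)
            else:
                key, value = arg[2:], next(it, "true")
            result[key] = value
    return result


os.environ["SPACY_CONFIG_OVERRIDES"] = "--training.batch_size 128"
assert read_overrides(["--training.batch_size=64"]) == {"training.batch_size": "128"}
```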
@ -530,6 +530,8 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
  [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
  patterns as the second argument (instead of a variable number of arguments).
  The `on_match` callback becomes an optional keyword argument (see the sketch
  after this list).
- The `Doc` flags like `Doc.is_parsed` or `Doc.is_tagged` have been replaced by
  [`Doc.has_annotation`](/api/doc#has_annotation).
- The `spacy.gold` module has been renamed to
  [`spacy.training`](%%GITHUB_SPACY/spacy/training).
- The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has
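
A sketch of the updated `PhraseMatcher.add` call; the matcher key, patterns and
callback are made up for illustration:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc("Barack Obama"), nlp.make_doc("Angela Merkel")]

def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

# v3: the patterns are passed as a list, on_match is a keyword argument
matcher.add("NAMES", patterns, on_match=on_match)
matches = matcher(nlp("I saw Barack Obama yesterday."))
```
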
@ -807,10 +809,11 @@ nlp = spacy.blank("en")

### Migrating Doc flags {#migrating-doc-flags}

The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
`Doc.is_sentenced` are deprecated in v3 and replaced by
The [`Doc`](/api/doc) flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
`Doc.is_sentenced` are deprecated in v3.0 and replaced by
the [`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
token attribute symbols (the same symbols used in `Matcher` patterns):
token attribute symbols (the same symbols used in [`Matcher`](/api/matcher)
patterns):

```diff
doc = nlp(text)
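# sketch: the hunk is cut off here; each flag maps to an attribute symbol
- assert doc.is_parsed
+ assert doc.has_annotation("DEP")
- assert doc.is_tagged
+ assert doc.has_annotation("TAG")
```
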
@ -75,63 +75,64 @@
{
  "label": "Containers",
  "items": [
    { "text": "Language", "url": "/api/language" },
    { "text": "Doc", "url": "/api/doc" },
    { "text": "Token", "url": "/api/token" },
    { "text": "Span", "url": "/api/span" },
    { "text": "Lexeme", "url": "/api/lexeme" },
    { "text": "DocBin", "url": "/api/docbin" },
    { "text": "Example", "url": "/api/example" },
    { "text": "DocBin", "url": "/api/docbin" }
    { "text": "Language", "url": "/api/language" },
    { "text": "Lexeme", "url": "/api/lexeme" },
    { "text": "Span", "url": "/api/span" },
    { "text": "Token", "url": "/api/token" }
  ]
},
{
  "label": "Pipeline",
  "items": [
    { "text": "Tokenizer", "url": "/api/tokenizer" },
    { "text": "Tok2Vec", "url": "/api/tok2vec" },
    { "text": "Transformer", "url": "/api/transformer" },
    { "text": "Lemmatizer", "url": "/api/lemmatizer" },
    { "text": "Morphologizer", "url": "/api/morphologizer" },
    { "text": "Tagger", "url": "/api/tagger" },
    { "text": "AttributeRuler", "url": "/api/attributeruler" },
    { "text": "DependencyParser", "url": "/api/dependencyparser" },
    { "text": "EntityLinker", "url": "/api/entitylinker" },
    { "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
    { "text": "EntityRuler", "url": "/api/entityruler" },
    { "text": "EntityLinker", "url": "/api/entitylinker" },
    { "text": "TextCategorizer", "url": "/api/textcategorizer" },
    { "text": "Sentencizer", "url": "/api/sentencizer" },
    { "text": "Lemmatizer", "url": "/api/lemmatizer" },
    { "text": "Morphologizer", "url": "/api/morphologizer" },
    { "text": "Pipe", "url": "/api/pipe" },
    { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
    { "text": "Other Functions", "url": "/api/pipeline-functions" },
    { "text": "Pipe", "url": "/api/pipe" }
    { "text": "Sentencizer", "url": "/api/sentencizer" },
    { "text": "Tagger", "url": "/api/tagger" },
    { "text": "TextCategorizer", "url": "/api/textcategorizer" },
    { "text": "Tok2Vec", "url": "/api/tok2vec" },
    { "text": "Tokenizer", "url": "/api/tokenizer" },
    { "text": "Transformer", "url": "/api/transformer" },
    { "text": "Other Functions", "url": "/api/pipeline-functions" }
  ]
},
{
  "label": "Matchers",
  "items": [
    { "text": "DependencyMatcher", "url": "/api/dependencymatcher" },
    { "text": "Matcher", "url": "/api/matcher" },
    { "text": "PhraseMatcher", "url": "/api/phrasematcher" },
    { "text": "DependencyMatcher", "url": "/api/dependencymatcher" }
    { "text": "PhraseMatcher", "url": "/api/phrasematcher" }
  ]
},
{
  "label": "Other",
  "items": [
    { "text": "Vocab", "url": "/api/vocab" },
    { "text": "Corpus", "url": "/api/corpus" },
    { "text": "KnowledgeBase", "url": "/api/kb" },
    { "text": "Lookups", "url": "/api/lookups" },
    { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
    { "text": "Morphology", "url": "/api/morphology" },
    { "text": "Scorer", "url": "/api/scorer" },
    { "text": "StringStore", "url": "/api/stringstore" },
    { "text": "Vectors", "url": "/api/vectors" },
    { "text": "Lookups", "url": "/api/lookups" },
    { "text": "Morphology", "url": "/api/morphology" },
    { "text": "KnowledgeBase", "url": "/api/kb" },
    { "text": "Scorer", "url": "/api/scorer" },
    { "text": "Corpus", "url": "/api/corpus" }
    { "text": "Vocab", "url": "/api/vocab" }
  ]
},
{
  "label": "Cython",
  "items": [
    { "text": "Architecture", "url": "/api/cython" },
    { "text": "Structs", "url": "/api/cython-structs" },
    { "text": "Classes", "url": "/api/cython-classes" }
    { "text": "Classes", "url": "/api/cython-classes" },
    { "text": "Structs", "url": "/api/cython-structs" }
  ]
}
]