Tidy up and auto-format

Mirror of https://github.com/explosion/spaCy.git
commit e68459296d
parent 2a4d56e730
@@ -7,8 +7,6 @@ import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
 from .. import util
-from ..lang.en import English
-from ..util import dot_to_object


 @debug_cli.command("model")

@@ -130,8 +128,8 @@ def _sentences():
     ]


-def _get_docs():
-    nlp = English()
+def _get_docs(lang: str = "en"):
+    nlp = util.get_lang_class(lang)()
     return list(nlp.pipe(_sentences()))

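Note: the debug helper now builds its example docs from a configurable language code instead of hard-coding `English`. A minimal sketch of the same pattern, assuming only that `spacy.util.get_lang_class` looks up a `Language` subclass by its code (as the new line above does); the helper name and sample text are illustrative:

    from spacy import util

    def make_blank_docs(texts, lang: str = "en"):
        # Look up the Language subclass for the given code, instantiate a
        # blank pipeline, and tokenize the texts with it.
        nlp = util.get_lang_class(lang)()
        return list(nlp.pipe(texts))

    docs = make_blank_docs(["This is a sentence."], lang="en")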
@@ -1,5 +1,4 @@
 from typing import Optional, List, Dict
-from timeit import default_timer as timer
 from wasabi import Printer
 from pathlib import Path
 import re

@@ -1,7 +1,6 @@
 from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import tqdm
 import re
 import shutil
 import requests

@@ -1,14 +1,8 @@
-from .corpus import Corpus
-from .example import Example
-from .align import Alignment
-
-from .iob_utils import iob_to_biluo, biluo_to_iob
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
-from .iob_utils import spans_from_biluo_tags
-from .iob_utils import tags_to_entities
-
-from .gold_io import docs_to_json
-from .gold_io import read_json_file
-
-
-from .batchers import minibatch_by_padded_size, minibatch_by_words
+from .corpus import Corpus  # noqa: F401
+from .example import Example  # noqa: F401
+from .align import Alignment  # noqa: F401
+from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
+from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
+from .iob_utils import spans_from_biluo_tags, tags_to_entities  # noqa: F401
+from .gold_io import docs_to_json, read_json_file  # noqa: F401
+from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
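Note: these `__init__.py` re-exports exist only so callers can import the names from the package, so each line carries `# noqa: F401` to silence flake8's "imported but unused" warning. The same convention in a hypothetical package, for illustration:

    # mypkg/__init__.py (hypothetical)
    # Re-export the public API from a private submodule; the noqa comment
    # tells flake8 the "unused" imports are intentional.
    from ._tokenize import tokenize, detokenize  # noqa: F401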
@@ -3,7 +3,6 @@ from typing import Optional, Any
 from functools import partial
 import itertools

-from .example import Example
 from ..util import registry, minibatch

@@ -41,16 +40,13 @@ def configure_minibatch_by_words(
 ) -> BatcherT:
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
-        minibatch_by_words,
-        size=size,
-        discard_oversize=discard_oversize,
-        **optionals
+        minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
     )


 @registry.batchers("batch_by_sequence.v1")
 def configure_minibatch(size: Sizing, get_length=None) -> BatcherT:
-    optionals = ({"get_length": get_length} if get_length is not None else {})
+    optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(minibatch, size=size, **optionals)

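Note: both registered factories return a partially applied batching function rather than computing batches up front; the registry call binds the configuration, and the returned callable is applied to the data later. A minimal sketch of that idea with a hypothetical `batch_by_items` helper (not spaCy's real batchers):

    from functools import partial
    from typing import Callable, Iterable, List, TypeVar

    ItemT = TypeVar("ItemT")
    BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]

    def batch_by_items(items: Iterable[ItemT], size: int) -> Iterable[List[ItemT]]:
        # Yield fixed-size lists of items; the last batch may be smaller.
        batch: List[ItemT] = []
        for item in items:
            batch.append(item)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:
            yield batch

    def configure_batch_by_items(size: int) -> BatcherT:
        # Bind the config now, run the batching later, exactly like the
        # partial(...) returned by the registered factories above.
        return partial(batch_by_items, size=size)

    batcher = configure_batch_by_items(size=3)
    print(list(batcher(range(7))))  # [[0, 1, 2], [3, 4, 5], [6]]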
@@ -1,4 +1,4 @@
 from .iob2docs import iob2docs  # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .json2docs import json2docs
+from .json2docs import json2docs  # noqa: F401
 from .conllu2docs import conllu2docs  # noqa: F401

@@ -1,6 +1,5 @@
-from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable, Tuple
+from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
 from pathlib import Path
-import random

 from .. import util
 from .example import Example
@@ -25,7 +24,7 @@ class Corpus:

     path (Path): The directory or filename to read from.
     gold_preproc (bool): Whether to set up the Example object with gold-standard
         sentences and tokens for the predictions. Gold preprocessing helps
         the annotations align to the tokenization, and may result in sequences
         of more consistent length. However, it may reduce run-time accuracy due
         to train/test skew. Defaults to False.
@@ -39,7 +38,12 @@ class Corpus:
     """

     def __init__(
-        self, path, *, limit: int = 0, gold_preproc: bool = False, max_length: bool = False,
+        self,
+        path,
+        *,
+        limit: int = 0,
+        gold_preproc: bool = False,
+        max_length: bool = False,
     ) -> None:
         self.path = util.ensure_path(path)
         self.gold_preproc = gold_preproc

@@ -80,7 +80,7 @@ def _get_transition_table(
     B_start, B_end = (0, n_labels)
     I_start, I_end = (B_end, B_end + n_labels)
     L_start, L_end = (I_end, I_end + n_labels)
-    U_start, _ = (L_end, L_end + n_labels)
+    U_start, _ = (L_end, L_end + n_labels)  # noqa: F841
     # Using ranges allows us to set specific cells, which is necessary to express
     # that only actions of the same label are valid continuations.
     B_range = numpy.arange(B_start, B_end)
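Note: the flagged line keeps an intentionally unused local (hence `# noqa: F841`), but the surrounding layout is the interesting part: each action type (B, I, L, U) owns a consecutive block of `n_labels` columns in the transition table, and `numpy.arange` turns each block into addressable indices. A small illustrative sketch of that layout, independent of the parser internals:

    import numpy

    n_labels = 3  # e.g. PERSON, ORG, GPE

    # Consecutive blocks of n_labels indices: B, then I, then L, then U.
    B_start, B_end = 0, n_labels
    I_start, I_end = B_end, B_end + n_labels
    L_start, L_end = I_end, I_end + n_labels
    U_start, U_end = L_end, L_end + n_labels

    # numpy.arange gives the column indices of each block, so "the I action
    # for label k" can be addressed as I_range[k].
    B_range = numpy.arange(B_start, B_end)
    I_range = numpy.arange(I_start, I_end)
    print(B_range, I_range)  # [0 1 2] [3 4 5]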
@@ -17,9 +17,7 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
 AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]


-@Language.factory(
-    "attribute_ruler",
-)
+@Language.factory("attribute_ruler")
 def make_attribute_ruler(
     nlp: Language,
     name: str,
@@ -58,7 +56,7 @@ class AttributeRuler(Pipe):
         self.vocab = vocab
         self.matcher = Matcher(self.vocab)
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []

         if pattern_dicts:
@@ -1,17 +1,23 @@
 from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type
-from typing import Iterable, TypeVar
+from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic import root_validator
 from collections import defaultdict
 from thinc.api import Optimizer
-from pathlib import Path

 from .attrs import NAMES

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from .language import Language  # noqa: F401
+    from .gold import Example  # noqa: F401
+
+
 ItemT = TypeVar("ItemT")
 Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
+Reader = Callable[["Language", str], Iterable["Example"]]


 def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
@@ -183,7 +189,6 @@ class ModelMetaSchema(BaseModel):
 # check that against this schema in the test suite to make sure it's always
 # up to date.

-Reader = Callable[["Language", str], Iterable["Example"]]

 class ConfigSchemaTraining(BaseModel):
     # fmt: off

@@ -209,7 +214,6 @@ class ConfigSchemaTraining(BaseModel):
         extra = "forbid"
         arbitrary_types_allowed = True

-    #eval_batch_size: StrictInt = Field(..., title="Evaluation batch size")

 class ConfigSchemaNlp(BaseModel):
     # fmt: off
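Note: the imports added above under `if TYPE_CHECKING:` give the schema annotations access to `Language` and `Example` without importing them at runtime, which would otherwise create a circular import; the string forward references such as `"Language"` are resolved only by type checkers. A minimal sketch of the pattern with two hypothetical modules:

    # models.py (hypothetical module that registry.py also imports)
    from typing import TYPE_CHECKING, List

    if TYPE_CHECKING:
        # Evaluated by mypy/pyright only, never at runtime, so the circular
        # import between models.py and registry.py is avoided.
        from registry import Registry  # noqa: F401


    def collect_names(registry: "Registry") -> List[str]:
        # "Registry" is a forward reference; at runtime it is just a string.
        return list(registry.names)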
@@ -291,6 +291,6 @@ def test_span_boundaries(doc):
     for i in range(start, end):
         assert span[i - start] == doc[i]
     with pytest.raises(IndexError):
-        _ = span[-5]
+        span[-5]
     with pytest.raises(IndexError):
-        _ = span[5]
+        span[5]
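Note: dropping the `_ =` assignment leaves a bare indexing expression inside `pytest.raises`, which is all the assertion needs; the throwaway variable would only have triggered flake8's F841. A tiny illustrative test in the same style:

    import pytest


    def test_out_of_range_raises():
        items = [1, 2, 3]
        with pytest.raises(IndexError):
            # The bare expression is enough to trigger and capture the error.
            items[10]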
@@ -29,9 +29,7 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     nlp = Chinese(
         meta={
-            "tokenizer": {
-                "config": {"segmenter": "pkuseg", "pkuseg_model": "medicine",}
-            }
+            "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
         }
     )
     zh_tokenizer_serialize(nlp.tokenizer)
@@ -21,7 +21,7 @@ re_pattern5 = "B*A*B"
 longest1 = "A A A A A"
 longest2 = "A A A A A"
 longest3 = "A A"
 longest4 = "B A A A A A B"  # "FIRST" would be "B B"
 longest5 = "B B A A A A A B"

@@ -1,6 +1,6 @@
 import pytest
-from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc
-from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree
+from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
+from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
 from spacy.pipeline._parser_internals import nonproj

 from ..util import get_doc

@@ -75,19 +75,18 @@ def test_attributeruler_init(nlp, pattern_dicts):

 def test_attributeruler_init_patterns(nlp, pattern_dicts):
     # initialize with patterns
-    a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
+    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})

     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
     assert doc[3].morph_ == "Case=Nom|Number=Sing"

     nlp.remove_pipe("attribute_ruler")

     # initialize with patterns from asset
-    a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}})
+    nlp.add_pipe(
+        "attribute_ruler",
+        config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
+    )
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
@@ -117,12 +117,15 @@ def test_kb_default(nlp):
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
-    assert entity_linker.kb.entity_vector_length == 64  # default value from pipeline.entity_linker
+    # default value from pipeline.entity_linker
+    assert entity_linker.kb.entity_vector_length == 64


 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
-    entity_linker = nlp.add_pipe("entity_linker", config={"kb": {"entity_vector_length": 35}})
+    entity_linker = nlp.add_pipe(
+        "entity_linker", config={"kb": {"entity_vector_length": 35}}
+    )
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0

@@ -117,9 +117,7 @@ def test_overfitting_IO():
     assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)

     # Test scoring
-    scores = nlp.evaluate(
-        train_examples, scorer_cfg={"positive_label": "POSITIVE"}
-    )
+    scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
     assert scores["cats_f"] == 1.0
     assert scores["cats_score"] == 1.0
     assert "cats_score_desc" in scores
@@ -88,14 +88,9 @@ def my_parser():
             width=321,
             rows=5432,
             also_embed_subwords=True,
-            also_use_static_vectors=False
+            also_use_static_vectors=False,
         ),
-        MaxoutWindowEncoder(
-            width=321,
-            window_size=3,
-            maxout_pieces=4,
-            depth=2
-        )
+        MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
         tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5

@@ -1,5 +1,4 @@
 import spacy
-import pytest
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin

@@ -711,16 +711,18 @@ def test_alignment_different_texts():
     with pytest.raises(ValueError):
         Alignment.from_strings(other_tokens, spacy_tokens)


 def test_retokenized_docs(doc):
     a = doc.to_array(["TAG"])
     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
     doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
     example = Example(doc1, doc2)
-    assert example.get_aligned("ORTH", as_string=True) == ['Sarah', "'s", 'sister', 'flew', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
+    # fmt: off
+    expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
+    expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
+    # fmt: on
+    assert example.get_aligned("ORTH", as_string=True) == expected1
     with doc1.retokenize() as retokenizer:
         retokenizer.merge(doc1[0:2])
         retokenizer.merge(doc1[5:7])
-    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
+    assert example.get_aligned("ORTH", as_string=True) == expected2
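Note: the `# fmt: off` / `# fmt: on` pair tells the auto-formatter (Black honors these pragmas) to leave the enclosed long literal lists on single lines instead of reflowing them. The same pragma on an unrelated, hypothetical table, for illustration:

    # fmt: off
    # Black leaves these rows untouched, so the columns stay visually aligned.
    COLOR_TABLE = [
        ("red",   (255,   0,   0)),
        ("green", (  0, 255,   0)),
        ("blue",  (  0,   0, 255)),
    ]
    # fmt: on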
@@ -24,6 +24,7 @@ def get_textcat_kwargs():
         "nO": 7,
     }

+
 def get_textcat_cnn_kwargs():
     return {
         "tok2vec": test_tok2vec(),

@@ -31,6 +32,7 @@ def get_textcat_cnn_kwargs():
         "nO": 13,
     }

+
 def get_all_params(model):
     params = []
     for node in model.walk():

@@ -59,17 +61,11 @@ def get_tok2vec_kwargs():
     # This actually creates models, so seems best to put it in a function.
     return {
         "embed": MultiHashEmbed(
-            width=32,
-            rows=500,
-            also_embed_subwords=True,
-            also_use_static_vectors=False
+            width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
         ),
         "encode": MaxoutWindowEncoder(
-            width=32,
-            depth=2,
-            maxout_pieces=2,
-            window_size=1,
-        )
+            width=32, depth=2, maxout_pieces=2, window_size=1,
+        ),
     }

@@ -19,14 +19,9 @@ def test_empty_doc():
             width=width,
             rows=embed_size,
             also_use_static_vectors=False,
-            also_embed_subwords=True
+            also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(
-            width=width,
-            depth=4,
-            window_size=1,
-            maxout_pieces=3
-        )
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update([doc])

@@ -44,14 +39,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
             width=width,
             rows=embed_size,
             also_use_static_vectors=False,
-            also_embed_subwords=True
+            also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(
-            width=width,
-            depth=4,
-            window_size=1,
-            maxout_pieces=3,
-        )
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update(batch)
@@ -85,27 +85,24 @@ def test_util_dot_section():
     """
     nlp_config = Config().from_str(cfg_string)
     en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)

     default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
     default_config["nlp"]["lang"] = "nl"
     nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)

     # Test that creation went OK
     assert isinstance(en_nlp, English)
     assert isinstance(nl_nlp, Dutch)
     assert nl_nlp.pipe_names == []
     assert en_nlp.pipe_names == ["textcat"]
-    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False  # not exclusive_classes
+    # not exclusive_classes
+    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert not en_config["nlp"]["load_vocab_data"]
     assert nl_config["nlp"]["load_vocab_data"]  # default value True

     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
-        obj = dot_to_object(en_config, "nlp.pipeline.tagger")
+        dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
-        obj = dot_to_object(en_config, "nlp.unknownattribute")
+        dot_to_object(en_config, "nlp.unknownattribute")
     assert not dot_to_object(en_config, "nlp.load_vocab_data")
     assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
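Note: `dot_to_object` resolves a dotted path such as "training.optimizer" against the nested config and raises `KeyError` for unknown keys, which is exactly what this test exercises. A minimal sketch of the idea with a hypothetical `dot_get` over a plain nested dict (not spaCy's actual implementation):

    from typing import Any, Dict

    def dot_get(config: Dict[str, Any], path: str) -> Any:
        # Walk one key per dotted segment; a missing segment raises KeyError,
        # mirroring how the test expects unknown paths to fail loudly.
        node: Any = config
        for key in path.split("."):
            node = node[key]
        return node

    cfg = {"nlp": {"lang": "nl", "load_vocab_data": True}}
    print(dot_get(cfg, "nlp.lang"))        # "nl"
    # dot_get(cfg, "nlp.pipeline.tagger")  # would raise KeyError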
@@ -1,5 +1,5 @@
 from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
-from typing import Iterator, Type, Pattern, Sequence, TYPE_CHECKING
+from typing import Iterator, Type, Pattern, TYPE_CHECKING
 from types import ModuleType
 import os
 import importlib

@@ -764,7 +764,6 @@ def normalize_slice(
     return start, stop

-

 def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
     """Filter a sequence of spans and remove duplicates or overlaps. Useful for
     creating named entities (where one token can only be part of one entity) or

@@ -1113,6 +1112,3 @@ def minibatch(items, size):
         if len(batch) == 0:
             break
         yield list(batch)
-
-
-