Fix augment

Matthew Honnibal 2020-10-05 16:41:45 +02:00
commit 8deed614e9
12 changed files with 19 additions and 21 deletions

pyproject.toml

@@ -7,7 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a43,<8.0.0a50",
-    "blis>=0.4.0,<0.5.0",
+    "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
 ]

requirements.txt

@@ -2,7 +2,7 @@
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0

setup.cfg

@@ -41,7 +41,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.0.0a43,<8.0.0a50
-    blis>=0.4.0,<0.5.0
+    blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
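
Note: the blis pin is widened from <0.5.0 to <0.8.0 in three places at once (pyproject.toml, requirements.txt, setup.cfg), and the three declarations have to stay in sync. A throwaway sketch of a consistency check, assuming the repo layout above; the helper is hypothetical, not part of spaCy:

# Hypothetical consistency check: the blis pin must match across the three
# files changed above. Illustrative only, not part of the spaCy codebase.
FILES = ("pyproject.toml", "requirements.txt", "setup.cfg")

def blis_pin(path):
    # Return the blis requirement line, stripped of TOML/cfg decoration.
    with open(path, encoding="utf8") as f:
        for line in f:
            if "blis" in line:
                return line.strip().strip('",')
    return None

pins = {path: blis_pin(path) for path in FILES}
assert len(set(pins.values())) == 1, f"blis pins disagree: {pins}"
print("blis pin consistent:", pins["setup.cfg"])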

spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a33"
+__version__ = "3.0.0a34"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

spacy/lookups.py

@@ -289,13 +289,12 @@ class Lookups:
 
         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
+        filepath = path / filename
+        with filepath.open("wb") as file_:
+            file_.write(self.to_bytes())
 
     def from_disk(
         self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
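
Note: the removed "if len(self._tables):" guard meant an empty Lookups object wrote no lookups.bin at all, so a directory serialized with to_disk could be missing the file that from_disk later looks for. After this change the file is always written. A minimal round-trip sketch, assuming the spacy-nightly API shown in the hunk; the temp path is illustrative:

# Round-trip sketch: with the guard removed, even an empty Lookups always
# writes lookups.bin, keeping to_disk/from_disk symmetric.
from pathlib import Path
from spacy.lookups import Lookups

path = Path("/tmp/lookups_demo")      # illustrative location
lookups = Lookups()                   # no tables added
lookups.to_disk(path)                 # now creates lookups.bin regardless
assert (path / "lookups.bin").exists()

restored = Lookups().from_disk(path)  # from_disk returns the loaded Lookups
assert len(restored.tables) == 0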

spacy/pipeline/morphologizer.pyx

@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
         """

spacy/pipeline/pipe.pyx

@@ -162,7 +162,7 @@ cdef class Pipe:
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/pipe#get_loss
         """

spacy/pipeline/senter.pyx

@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
         """

spacy/pipeline/tagger.pyx

@@ -249,7 +249,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """

spacy/pipeline/textcat.py

@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
         """

spacy/training/augment.py

@@ -5,7 +5,7 @@ import copy
 from functools import partial
 from pydantic import BaseModel, StrictStr
 
-from ..util import registry, logger
+from ..util import registry
 from ..tokens import Doc
 from .example import Example
@@ -119,7 +119,6 @@ def make_orth_variants(
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
-    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
     words = token_dict.get("ORTH", [])
     tags = token_dict.get("TAG", [])
     # keep unmodified if words or tags are not defined
@@ -139,7 +138,7 @@ def make_orth_variants(
             punct_choices = [random.choice(x["variants"]) for x in ndpv]
             for word_idx in range(len(words)):
                 for punct_idx in range(len(ndpv)):
-                    if tags[word_idx] in ndpv[punct_idx]["TAG"] and words[
+                    if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
                         word_idx
                     ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
                         # backup option: random left vs. right from pair
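
Note: the rename from ndpv[punct_idx]["TAG"] to ndpv[punct_idx]["tags"] matches the schema of the orth-variant data, in which each entry keys its POS tags under lowercase "tags"; the old uppercase key is not present in the data, so the lookup failed and paired punctuation variants were never applied. A sketch of the expected shape; the example values are illustrative, loosely modeled on spaCy's English orth variants:

# Shape of the orth_variants data consumed by make_orth_variants. Entries
# key their POS tags under lowercase "tags", which is why the old lookup
# ndpv[punct_idx]["TAG"] failed. Example values are illustrative.
orth_variants = {
    "single": [
        {"tags": ["NFP"], "variants": ["…", "..."]},
    ],
    "paired": [
        {"tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]]},
    ],
}

# The fixed lookup from the hunk above, applied to the sample data:
ndpv = orth_variants["paired"]
assert "``" in ndpv[0]["tags"]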

spacy/vocab.pyx

@@ -445,9 +445,9 @@ cdef class Vocab:
         setters = ["strings", "vectors"]
         if "strings" not in exclude:
             self.strings.to_disk(path / "strings.json")
-        if "vectors" not in "exclude" and self.vectors is not None:
+        if "vectors" not in "exclude":
             self.vectors.to_disk(path)
-        if "lookups" not in "exclude" and self.lookups is not None:
+        if "lookups" not in "exclude":
             self.lookups.to_disk(path)
 
     def from_disk(self, path, *, exclude=tuple()):
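
Note: both sides of this hunk still test membership in the string literal "exclude" rather than the exclude argument. Since "vectors" and "lookups" are not substrings of "exclude", both conditions are always true and the exclude option is silently ignored for those components. A minimal sketch of the presumably intended check; the helper below is hypothetical and only mirrors the logic:

# Hypothetical mirror of the Vocab.to_disk logic with the string literal
# "exclude" replaced by the exclude parameter. In the committed code,
# "vectors" not in "exclude" is substring membership and always True.
def to_disk_setters(exclude=tuple()):
    setters = []
    if "strings" not in exclude:
        setters.append("strings")
    if "vectors" not in exclude:   # committed code: "vectors" not in "exclude"
        setters.append("vectors")
    if "lookups" not in exclude:   # committed code: "lookups" not in "exclude"
        setters.append("lookups")
    return setters

assert to_disk_setters(exclude=("vectors",)) == ["strings", "lookups"]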