diff --git a/pyproject.toml b/pyproject.toml
index 611a95d27..d48886e0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a43,<8.0.0a50",
-    "blis>=0.4.0,<0.5.0",
+    "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
 ]
diff --git a/requirements.txt b/requirements.txt
index 44dad38e3..29695e9b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 7192ba9d4..d8362c4bd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.0.0a43,<8.0.0a50
-    blis>=0.4.0,<0.5.0
+    blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
diff --git a/spacy/about.py b/spacy/about.py
index 9329b48e6..373d1d2b0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a33"
+__version__ = "3.0.0a34"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/errors.py b/spacy/errors.py
index 20edf45b5..bf3628ce9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,10 +456,14 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+    E901 = ("Failed to remove existing output directory: {path}. If your "
+            "config and the components you train change between runs, a "
+            "non-empty output directory can lead to stale pipeline data. To "
+            "solve this, remove the existing directories in the output directory.")
+    E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
-    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+    E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
             "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
     E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
             "dimension refers to the output width, after the linear projection "
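The old E092/E093 messages move into the E9xx range so the numbering survives the develop-to-master merge, and the new E901 is a plain format string with a `{path}` placeholder. A minimal sketch of how a caller fills it in, matching the `clean_output_dir` usage further down (the path here is hypothetical):

```python
# Sketch: E901 carries a {path} placeholder, so callers format the message
# before raising, as spacy/training/loop.py does below. Path is made up.
from spacy.errors import Errors

try:
    raise IOError(Errors.E901.format(path="training/output"))
except IOError as err:
    print(err)
```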
diff --git a/spacy/lookups.py b/spacy/lookups.py
index fb5e3d748..133cb0672 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -289,13 +289,12 @@ class Lookups:
 
         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
+        filepath = path / filename
+        with filepath.open("wb") as file_:
+            file_.write(self.to_bytes())
 
     def from_disk(
         self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
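Dropping the `if len(self._tables)` guard means `to_disk` now always writes `lookups.bin`, even for an empty table set, so saving and reloading behaves uniformly. A small sketch of the resulting round-trip, assuming a spaCy nightly install and an arbitrary scratch directory:

```python
# Sketch: an empty Lookups is now written to disk too, so a fresh save/load
# round-trips instead of silently writing nothing.
from pathlib import Path
from spacy.lookups import Lookups

out_dir = Path("/tmp/lookups_demo")  # hypothetical scratch directory
lookups = Lookups()                  # no tables added
lookups.to_disk(out_dir)             # previously a no-op when empty
loaded = Lookups().from_disk(out_dir)
assert len(loaded.tables) == 0
```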
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 82f3bf37d..6d97b062f 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
         """
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 41ca23ace..8e103a638 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -162,7 +162,7 @@ cdef class Pipe:
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/pipe#get_loss
         """
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 0bfef7c7b..8fb1e664f 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
         """
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 6cb582b36..94ac0c082 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -249,7 +249,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index fc60ebf89..292598e3a 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
         """
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index e6d10a195..e76ee49f7 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -5,7 +5,7 @@ import copy
 from functools import partial
 from pydantic import BaseModel, StrictStr
 
-from ..util import registry, logger
+from ..util import registry
 from ..tokens import Doc
 from .example import Example
 
@@ -119,9 +119,8 @@
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
    ndpv = orth_variants.get("paired", [])
-    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
-    words = token_dict.get("words", [])
-    tags = token_dict.get("tags", [])
+    words = token_dict.get("ORTH", [])
+    tags = token_dict.get("TAG", [])
     # keep unmodified if words or tags are not defined
     if words and tags:
         if lower:
@@ -154,8 +153,8 @@
                 if words[word_idx] in pair:
                     pair_idx = pair.index(words[word_idx])
                     words[word_idx] = punct_choices[punct_idx][pair_idx]
-    token_dict["words"] = words
-    token_dict["tags"] = tags
+    token_dict["ORTH"] = words
+    token_dict["TAG"] = tags
     # modify raw
     if raw is not None:
         variants = []
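The augmenter now reads and writes the token keys `ORTH` and `TAG` rather than `words` and `tags`, and the per-call debug logging goes away along with the now-unused `logger` import. For orientation, a rough sketch of the data shapes this function works with; the concrete variants are illustrative only, though the `tags`/`variants` structure follows what this module reads:

```python
# Illustrative shapes only: "single" entries replace a token's form when its
# tag matches; "paired" entries swap matched pairs such as quote characters.
orth_variants = {
    "single": [{"tags": ["NFP"], "variants": ["…", "..."]}],
    "paired": [{"tags": ["``", "''"], "variants": [["'", "'"], ["`", "'"]]}],
}
# token_dict now uses ORTH/TAG (this commit's change), not words/tags:
token_dict = {"ORTH": ["`", "Hello", "'"], "TAG": ["``", "UH", "''"]}
```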
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 28f0f87c3..c01686aee 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -103,7 +103,7 @@ def conll_ner_to_docs(
         lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
         cols = list(zip(*[line.split() for line in lines]))
         if len(cols) < 2:
-            raise ValueError(Errors.E093)
+            raise ValueError(Errors.E903)
         length = len(cols[0])
         words.extend(cols[0])
         sent_starts.extend([True] + [False] * (length - 1))
diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py
index 73ad8953d..a2185fef7 100644
--- a/spacy/training/converters/iob_to_docs.py
+++ b/spacy/training/converters/iob_to_docs.py
@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
             sent_words, sent_iob = zip(*sent_tokens)
             sent_tags = ["-"] * len(sent_words)
         else:
-            raise ValueError(Errors.E092)
+            raise ValueError(Errors.E902)
         words.extend(sent_words)
         tags.extend(sent_tags)
         iob.extend(sent_iob)
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 0d4414964..67f61567e 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,19 +3,24 @@
 from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+from wasabi import Printer
 import random
-import wasabi
 import sys
+import shutil
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
 from ..errors import Errors
-from ..util import resolve_dot_names, registry
+from ..util import resolve_dot_names, registry, logger
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
+DIR_MODEL_BEST = "model-best"
+DIR_MODEL_LAST = "model-last"
+
+
 def train(
     nlp: "Language",
     output_path: Optional[Path] = None,
@@ -38,7 +43,7 @@
     RETURNS (Path / None): The path to the final exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
-    msg = wasabi.Printer(no_print=True)
+    msg = Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -69,6 +74,7 @@
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
+    clean_output_dir(output_path)
     stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
     if frozen_components:
         stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
@@ -83,7 +89,7 @@
                 update_meta(T, nlp, info)
                 with nlp.use_params(optimizer.averages):
                     nlp = before_to_disk(nlp)
-                    nlp.to_disk(output_path / "model-best")
+                    nlp.to_disk(output_path / DIR_MODEL_BEST)
     except Exception as e:
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
@@ -100,7 +106,7 @@
     finally:
         finalize_logger()
     if output_path is not None:
-        final_model_path = output_path / "model-last"
+        final_model_path = output_path / DIR_MODEL_LAST
         if optimizer.averages:
             with nlp.use_params(optimizer.averages):
                 nlp.to_disk(final_model_path)
@@ -305,3 +311,19 @@
         return modified_nlp
 
     return before_to_disk
+
+
+def clean_output_dir(path: Union[str, Path]) -> None:
+    """Remove an existing output directory. Typically used to ensure that a
+    directory like model-best and its contents aren't just being overwritten
+    by nlp.to_disk, which could preserve existing subdirectories (e.g.
+    components that don't exist anymore).
+    """
+    if path is not None and path.exists():
+        for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]:
+            if subdir.exists():
+                try:
+                    shutil.rmtree(str(subdir))
+                    logger.debug(f"Removed existing output directory: {subdir}")
+                except Exception as e:
+                    raise IOError(Errors.E901.format(path=path)) from e
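`clean_output_dir` exists because `nlp.to_disk` writes one subdirectory per pipeline component and only overwrites what it writes: a component removed from the config between runs would leave its old subdirectory behind inside `model-best` or `model-last`. A minimal sketch of the cleanup it performs (paths hypothetical):

```python
# Sketch of the stale-directory problem the helper avoids: remove the two
# model directories up front so a dropped component can't linger on disk.
from pathlib import Path
import shutil

output_path = Path("training")  # hypothetical output root
for subdir in [output_path / "model-best", output_path / "model-last"]:
    if subdir.exists():
        shutil.rmtree(str(subdir))  # same call clean_output_dir makes
```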
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a22f12c65..93918250b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -445,9 +445,9 @@
         setters = ["strings", "vectors"]
         if "strings" not in exclude:
             self.strings.to_disk(path / "strings.json")
-        if "vectors" not in "exclude" and self.vectors is not None:
+        if "vectors" not in exclude:
             self.vectors.to_disk(path)
-        if "lookups" not in "exclude" and self.lookups is not None:
+        if "lookups" not in exclude:
             self.lookups.to_disk(path)
 
     def from_disk(self, path, *, exclude=tuple()):
diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index a08d6bcb6..8ad106a78 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -38,7 +38,7 @@
     cursor: pointer
    display: inline-block
     padding: 0.35rem 0.5rem 0.25rem 0
-    margin: 0 1rem 0.75rem 0
+    margin: 0 1rem 0.5rem 0
     font-size: var(--font-size-xs)
     font-weight: bold
 
@@ -73,16 +73,19 @@
         background: var(--color-theme)
 
     .checkbox + &:before
+        $size: 18px
         content: ""
         display: inline-block
-        width: 20px
-        height: 20px
+        width: $size
+        height: $size
         border: 1px solid var(--color-subtle)
         vertical-align: middle
         margin-right: 0.5rem
         cursor: pointer
-        border-radius: var(--border-radius)
+        border-radius: $size / 4
         background: var(--color-back)
+        position: relative
+        top: -1px
 
     .checkbox:checked + &:before
         // Embed "check" icon here for simplicity
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 741973945..a8bdf21dc 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby'
 import { Quickstart, QS } from '../components/quickstart'
 import { repo } from '../components/util'
 
+const DEFAULT_MODELS = ['en']
+const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
 const DEFAULT_CUDA = 'cuda100'
 const CUDA = {
@@ -68,9 +70,13 @@ const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
+    const [selectedModels, setModels] = useState(DEFAULT_MODELS)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
         config: v => setTrain(v.includes('train')),
+        models: setModels,
+        optimize: v => setEfficiency(v.includes('efficiency')),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
@@ -89,13 +95,37 @@
                 ...DATA,
                 {
                     id: 'models',
-                    title: 'Trained Pipelines',
+                    title: 'Trained pipelines',
                     multiple: true,
                     options: models
                         .sort((a, b) => a.name.localeCompare(b.name))
-                        .map(({ code, name }) => ({ id: code, title: name })),
+                        .map(({ code, name }) => ({
+                            id: code,
+                            title: name,
+                            checked: DEFAULT_MODELS.includes(code),
+                        })),
                 },
             ]
+            if (selectedModels.length) {
+                data.push({
+                    id: 'optimize',
+                    title: 'Select pipeline for',
+                    options: [
+                        {
+                            id: 'efficiency',
+                            title: 'efficiency',
+                            checked: DEFAULT_OPT === 'efficiency',
+                            help: 'Faster and smaller pipeline, but less accurate',
+                        },
+                        {
+                            id: 'accuracy',
+                            title: 'accuracy',
+                            checked: DEFAULT_OPT === 'accuracy',
+                            help: 'Larger and slower pipeline, but more accurate',
+                        },
+                    ],
+                })
+            }
             return (
                         conda install -c conda-forge spacy-lookups-data
-                    {models.map(({ code, models: modelOptions }) => (
-                        <QS models={code} key={code}>
-                            python -m spacy download {modelOptions[0]}
-                        </QS>
-                    ))}
+                    {models.map(({ code, models: modelOptions }) => {
+                        const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
+                        return (
+                            <QS models={code} key={code}>
+                                python -m spacy download {pkg}
+                            </QS>
+                        )
+                    })}
                 </Quickstart>
             )
         }}
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index ffd1b3df9..5f94c60cb 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -31,25 +31,33 @@
     },
     {
         id: 'optimize',
-        title: 'Optimize for',
-        help:
-            'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+        title: 'Select for',
         options: [
-            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
-            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+            {
+                id: 'efficiency',
+                title: 'efficiency',
+                checked: DEFAULT_OPT === 'efficiency',
+                help: 'Faster and smaller pipeline, but less accurate',
+            },
+            {
+                id: 'accuracy',
+                title: 'accuracy',
+                checked: DEFAULT_OPT === 'accuracy',
+                help: 'Larger and slower pipeline, but more accurate',
+            },
         ],
     },
     {
         id: 'config',
         title: 'Options',
         multiple: true,
-        options: [{ id: 'example', title: 'Show usage example' }],
+        options: [{ id: 'example', title: 'Show text example' }],
     },
 ]
 
 const QuickstartInstall = ({ id, title, description, children }) => {
     const [lang, setLang] = useState(DEFAULT_LANG)
-    const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         lang: setLang,
         optimize: v => setEfficiency(v.includes('efficiency')),
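Both quickstart widgets now share the efficiency/accuracy toggle, and the install widget resolves it positionally: `modelOptions[efficiency ? 0 : modelOptions.length - 1]` takes the first package for efficiency and the last for accuracy. A sketch of that rule in Python; the package ordering (smallest first, most accurate last) is an assumption about how the site's model lists are sorted:

```python
# Sketch of the widget's positional selection rule; the list order below is
# assumed, not taken from the site data.
model_options = ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"]

def pick_package(options, efficiency=True):
    return options[0] if efficiency else options[-1]

assert pick_package(model_options) == "en_core_web_sm"
assert pick_package(model_options, efficiency=False) == "en_core_web_lg"
```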