Merge branch 'develop' into feature/project-spacy-version

Ines Montani 2020-10-05 21:06:07 +02:00
commit 9ca283a899
19 changed files with 119 additions and 51 deletions

View File

@@ -7,7 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a43,<8.0.0a50",
-    "blis>=0.4.0,<0.5.0",
+    "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
 ]
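
The widened blis pin can be sanity-checked against a local environment; a minimal sketch (assumes blis is already installed and Python 3.8+):

from importlib.metadata import version

# Any 0.4.x–0.7.x blis release now satisfies ">=0.4.0,<0.8.0".
print(version("blis"))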

View File

@@ -2,7 +2,7 @@
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0

View File

@@ -41,7 +41,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.0.0a43,<8.0.0a50
-    blis>=0.4.0,<0.5.0
+    blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a33"
+__version__ = "3.0.0a34"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -456,10 +456,14 @@ class Errors:
              "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+    E901 = ("Failed to remove existing output directory: {path}. If your "
+            "config and the components you train change between runs, a "
+            "non-empty output directory can lead to stale pipeline data. To "
+            "solve this, remove the existing directories in the output directory.")
+    E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
-    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+    E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
             "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
     E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
             "dimension refers to the output width, after the linear projection "

View File

@@ -289,7 +289,6 @@ class Lookups:
 
         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
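
With the `if len(self._tables)` guard removed, to_disk no longer silently skips serialization when no tables were added. A minimal sketch (assumes spacy-nightly is installed; the path is a stand-in):

from spacy.lookups import Lookups

lookups = Lookups()
# Previously a no-op for empty tables; now the directory and lookups
# data are always written.
lookups.to_disk("/tmp/lookups_demo")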

View File

@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
         """

View File

@@ -162,7 +162,7 @@ cdef class Pipe:
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/pipe#get_loss
         """

View File

@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
         """

View File

@@ -249,7 +249,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """

View File

@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
         """

View File

@@ -5,7 +5,7 @@ import copy
 from functools import partial
 from pydantic import BaseModel, StrictStr
 
-from ..util import registry, logger
+from ..util import registry
 from ..tokens import Doc
 from .example import Example
 
@@ -119,9 +119,8 @@ def make_orth_variants(
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
-    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
-    words = token_dict.get("words", [])
-    tags = token_dict.get("tags", [])
+    words = token_dict.get("ORTH", [])
+    tags = token_dict.get("TAG", [])
     # keep unmodified if words or tags are not defined
     if words and tags:
         if lower:
@@ -154,8 +153,8 @@ def make_orth_variants(
                     if words[word_idx] in pair:
                         pair_idx = pair.index(words[word_idx])
                         words[word_idx] = punct_choices[punct_idx][pair_idx]
-        token_dict["words"] = words
-        token_dict["tags"] = tags
+        token_dict["ORTH"] = words
+        token_dict["TAG"] = tags
     # modify raw
     if raw is not None:
         variants = []
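
The key rename matters because the augmenter reads the token annotation produced by Example.to_dict(). A minimal sketch of the dict shape now expected; the values are stand-ins:

# Token annotation keyed by "ORTH"/"TAG" (formerly "words"/"tags"):
token_dict = {"ORTH": ["I", "like", "London"], "TAG": ["PRP", "VBP", "NNP"]}
words = token_dict.get("ORTH", [])
tags = token_dict.get("TAG", [])
if words and tags:  # augmentation is skipped when either is missing
    print(list(zip(words, tags)))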

View File

@@ -103,7 +103,7 @@ def conll_ner_to_docs(
         lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
         cols = list(zip(*[line.split() for line in lines]))
         if len(cols) < 2:
-            raise ValueError(Errors.E093)
+            raise ValueError(Errors.E903)
         length = len(cols[0])
         words.extend(cols[0])
         sent_starts.extend([True] + [False] * (length - 1))

View File

@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
             sent_words, sent_iob = zip(*sent_tokens)
             sent_tags = ["-"] * len(sent_words)
         else:
-            raise ValueError(Errors.E092)
+            raise ValueError(Errors.E902)
         words.extend(sent_words)
         tags.extend(sent_tags)
         iob.extend(sent_iob)
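
For reference, a minimal sketch of the sentence-per-line input that E902 refers to, assuming the pipe-delimited word|tag|iob layout from the converter docs:

raw_sent = "I|PRP|O like|VBP|O London|NNP|B-GPE"
sent_tokens = [t.split("|") for t in raw_sent.split()]
# read_iob raises E902 when tokens don't split into two or three fields
assert all(len(fields) in (2, 3) for fields in sent_tokens)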

View File

@@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+from wasabi import Printer
 import random
-import wasabi
 import sys
+import shutil
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
 from ..errors import Errors
-from ..util import resolve_dot_names, registry
+from ..util import resolve_dot_names, registry, logger
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
+DIR_MODEL_BEST = "model-best"
+DIR_MODEL_LAST = "model-last"
+
 
 def train(
     nlp: "Language",
     output_path: Optional[Path] = None,
@@ -38,7 +43,7 @@ def train(
     RETURNS (Path / None): The path to the final exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
-    msg = wasabi.Printer(no_print=True)
+    msg = Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -69,6 +74,7 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
+    clean_output_dir(output_path)
     stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
     if frozen_components:
         stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
@@ -83,7 +89,7 @@ def train(
                     update_meta(T, nlp, info)
                     with nlp.use_params(optimizer.averages):
                         nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
+                        nlp.to_disk(output_path / DIR_MODEL_BEST)
     except Exception as e:
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
@@ -100,7 +106,7 @@ def train(
     finally:
         finalize_logger()
         if output_path is not None:
-            final_model_path = output_path / "model-last"
+            final_model_path = output_path / DIR_MODEL_LAST
             if optimizer.averages:
                 with nlp.use_params(optimizer.averages):
                     nlp.to_disk(final_model_path)
@@ -305,3 +311,19 @@ def create_before_to_disk_callback(
         return modified_nlp
 
     return before_to_disk
+
+
+def clean_output_dir(path: Union[str, Path]) -> None:
+    """Remove an existing output directory. Typically used to ensure that
+    a directory like model-best and its contents aren't just being overwritten
+    by nlp.to_disk, which could preserve existing subdirectories (e.g.
+    components that don't exist anymore).
+    """
+    if path is not None and path.exists():
+        for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]:
+            if subdir.exists():
+                try:
+                    shutil.rmtree(str(subdir))
+                    logger.debug(f"Removed existing output directory: {subdir}")
+                except Exception as e:
+                    raise IOError(Errors.E901.format(path=path)) from e
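
A self-contained sketch of the cleanup-before-save pattern clean_output_dir implements, using stand-in paths rather than the spaCy internals:

from pathlib import Path
import shutil
import tempfile

out = Path(tempfile.mkdtemp())
# Simulate a stale component directory left over from a previous run:
(out / "model-best" / "tagger").mkdir(parents=True)
for subdir in (out / "model-best", out / "model-last"):
    if subdir.exists():
        shutil.rmtree(str(subdir))  # clean slate before nlp.to_disk() writes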

View File

@@ -445,9 +445,9 @@ cdef class Vocab:
         setters = ["strings", "vectors"]
         if "strings" not in exclude:
             self.strings.to_disk(path / "strings.json")
-        if "vectors" not in "exclude" and self.vectors is not None:
+        if "vectors" not in "exclude":
             self.vectors.to_disk(path)
-        if "lookups" not in "exclude" and self.lookups is not None:
+        if "lookups" not in "exclude":
             self.lookups.to_disk(path)
 
     def from_disk(self, path, *, exclude=tuple()):
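
Note that on both sides of this hunk the membership test is against the string literal "exclude" rather than the exclude argument, so the condition is always true; that looks like a latent typo. A sketch of the presumably intended check:

exclude = ("vectors",)  # stand-in for the method's exclude argument
for key in ("strings", "vectors", "lookups"):
    if key not in exclude:  # intended test against the argument, not a literal
        print(f"would serialize {key}")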

View File

@@ -38,7 +38,7 @@
     cursor: pointer
     display: inline-block
     padding: 0.35rem 0.5rem 0.25rem 0
-    margin: 0 1rem 0.75rem 0
+    margin: 0 1rem 0.5rem 0
     font-size: var(--font-size-xs)
     font-weight: bold
 
@@ -73,16 +73,19 @@
         background: var(--color-theme)
 
     .checkbox + &:before
+        $size: 18px
         content: ""
         display: inline-block
-        width: 20px
-        height: 20px
+        width: $size
+        height: $size
         border: 1px solid var(--color-subtle)
         vertical-align: middle
        margin-right: 0.5rem
         cursor: pointer
-        border-radius: var(--border-radius)
+        border-radius: $size / 4
         background: var(--color-back)
+        position: relative
+        top: -1px
 
     .checkbox:checked + &:before
         // Embed "check" icon here for simplicity

View File

@@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby'
 import { Quickstart, QS } from '../components/quickstart'
 import { repo } from '../components/util'
 
+const DEFAULT_MODELS = ['en']
+const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
 const DEFAULT_CUDA = 'cuda100'
 const CUDA = {
@@ -68,9 +70,13 @@ const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
+    const [selectedModels, setModels] = useState(DEFAULT_MODELS)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
         config: v => setTrain(v.includes('train')),
+        models: setModels,
+        optimize: v => setEfficiency(v.includes('efficiency')),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
@@ -89,13 +95,37 @@ const QuickstartInstall = ({ id, title }) => {
         ...DATA,
         {
             id: 'models',
-            title: 'Trained Pipelines',
+            title: 'Trained pipelines',
             multiple: true,
             options: models
                 .sort((a, b) => a.name.localeCompare(b.name))
-                .map(({ code, name }) => ({ id: code, title: name })),
+                .map(({ code, name }) => ({
+                    id: code,
+                    title: name,
+                    checked: DEFAULT_MODELS.includes(code),
+                })),
         },
     ]
+    if (selectedModels.length) {
+        data.push({
+            id: 'optimize',
+            title: 'Select pipeline for',
+            options: [
+                {
+                    id: 'efficiency',
+                    title: 'efficiency',
+                    checked: DEFAULT_OPT === 'efficiency',
+                    help: 'Faster and smaller pipeline, but less accurate',
+                },
+                {
+                    id: 'accuracy',
+                    title: 'accuracy',
+                    checked: DEFAULT_OPT === 'accuracy',
+                    help: 'Larger and slower pipeline, but more accurate',
+                },
+            ],
+        })
+    }
     return (
         <Quickstart
             data={data}
@@ -149,11 +179,14 @@ const QuickstartInstall = ({ id, title }) => {
                     conda install -c conda-forge spacy-lookups-data
                 </QS>
-                {models.map(({ code, models: modelOptions }) => (
-                    <QS models={code} key={code}>
-                        python -m spacy download {modelOptions[0]}
-                    </QS>
-                ))}
+                {models.map(({ code, models: modelOptions }) => {
+                    const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
+                    return (
+                        <QS models={code} key={code}>
+                            python -m spacy download {pkg}
+                        </QS>
+                    )
+                })}
             </Quickstart>
         )
     }}

View File

@@ -31,25 +31,33 @@ const data = [
     },
     {
         id: 'optimize',
-        title: 'Optimize for',
-        help:
-            'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+        title: 'Select for',
         options: [
-            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
-            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+            {
+                id: 'efficiency',
+                title: 'efficiency',
+                checked: DEFAULT_OPT === 'efficiency',
+                help: 'Faster and smaller pipeline, but less accurate',
+            },
+            {
+                id: 'accuracy',
+                title: 'accuracy',
+                checked: DEFAULT_OPT === 'accuracy',
+                help: 'Larger and slower pipeline, but more accurate',
+            },
         ],
     },
     {
         id: 'config',
         title: 'Options',
         multiple: true,
-        options: [{ id: 'example', title: 'Show usage example' }],
+        options: [{ id: 'example', title: 'Show text example' }],
     },
 ]
 
 const QuickstartInstall = ({ id, title, description, children }) => {
     const [lang, setLang] = useState(DEFAULT_LANG)
-    const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         lang: setLang,
         optimize: v => setEfficiency(v.includes('efficiency')),