Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-09-26 12:18:16 +02:00
commit 452902c74a
12 changed files with 51 additions and 73 deletions

View File

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a34,<8.0.0a40",
+    "thinc>=8.0.0a35,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

View File

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a34,<8.0.0a40
+thinc>=8.0.0a35,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a25"
+__version__ = "3.0.0a26"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -209,6 +209,8 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
             continue
         else:
             locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
     return locs

View File

@@ -121,20 +121,19 @@ def train(
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
+        tok2vec_component = config["pretraining"]["component"]
+        if tok2vec_component is None:
             msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
                 exits=1,
             )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = config["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
     # Create iterator, which yields out info after each optimization step.
     msg.info("Start training")

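Aside, as a hedged sketch (not part of the commit): how the two new settings are consumed. The component name "tok2vec", the empty layer name, and the bare pretraining dict below are illustrative assumptions; the real code reads them from the loaded [pretraining] section of the config.

    # Hedged sketch, not code from the diff. Assumed example values for the
    # two new settings; nlp and weights_data stand in for the real objects.
    pretraining = {"component": "tok2vec", "layer": ""}

    layer = nlp.get_pipe(pretraining["component"]).model  # the pipe's Thinc model
    if pretraining["layer"]:
        # A non-empty name picks a sub-layer the model registered via set_ref()
        layer = layer.get_ref(pretraining["layer"])
    layer.from_bytes(weights_data)  # weights produced by 'spacy pretrain'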
View File

@@ -85,7 +85,7 @@ class Warnings:
             "attribute or operator.")
     # TODO: fix numbering after merging develop into master
-    W090 = ("Could not locate any binary .spacy files in path '{path}'.")
+    W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "

View File

@@ -79,23 +79,32 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
-        matches = sorted(self.matcher(doc, allow_missing=True))
-        for match_id, start, end in matches:
+        matches = self.matcher(doc, allow_missing=True)
+        # Sort by the attribute ID, so that later rules have precendence
+        matches = [
+            (_parse_key(self.vocab.strings[m_id]), m_id, s, e)
+            for m_id, s, e in matches
+        ]
+        matches.sort()
+        for attr_id, match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
-            attrs = self.attrs[span.label]
-            index = self.indices[span.label]
+            attrs = self.attrs[attr_id]
+            index = self.indices[attr_id]
             try:
+                # The index can be negative, which makes it annoying to do
+                # the boundscheck. Let Span do it instead.
                 token = span[index]
             except IndexError:
+                # The original exception is just our conditional logic, so we
+                # raise from.
                 raise ValueError(
                     Errors.E1001.format(
                         patterns=self.matcher.get(span.label),
                         span=[t.text for t in span],
                         index=index,
                     )
                 ) from None
-            set_token_attrs(token, attrs)
+            set_token_attrs(span[index], attrs)
         return doc

     def pipe(self, stream, *, batch_size=128):
@@ -173,7 +182,10 @@
         DOCS: https://nightly.spacy.io/api/attributeruler#add
         """
-        self.matcher.add(len(self.attrs), patterns)
+        # We need to make a string here, because otherwise the ID we pass back
+        # will be interpreted as the hash of a string, rather than an ordinal.
+        key = _make_key(len(self.attrs))
+        self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)
@@ -197,7 +209,7 @@
         all_patterns = []
         for i in range(len(self.attrs)):
             p = {}
-            p["patterns"] = self.matcher.get(i)[1]
+            p["patterns"] = self.matcher.get(_make_key(i))[1]
             p["attrs"] = self._attrs_unnormed[i]
             p["index"] = self.indices[i]
             all_patterns.append(p)
@@ -301,6 +313,12 @@
         return self


+def _make_key(n_attr):
+    return f"attr_rule_{n_attr}"
+
+
+def _parse_key(key):
+    return int(key.rsplit("_", 1)[1])
+
+
 def _split_morph_attrs(attrs):
     """Split entries from a tag map or morph rules dict into to two dicts, one

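Why the string keys are needed: the Matcher stores rule IDs as StringStore entries, so a raw integer such as len(self.attrs) would be interpreted as the hash of some unknown string rather than as an ordinal. Wrapping the ordinal in a string like "attr_rule_3" keeps the round trip lossless, and _parse_key recovers the ordinal so matches can be sorted by rule order. A small self-contained illustration (not code from the diff):

    def _make_key(n_attr):
        return f"attr_rule_{n_attr}"

    def _parse_key(key):
        return int(key.rsplit("_", 1)[1])

    key = _make_key(3)           # "attr_rule_3"
    assert _parse_key(key) == 3  # the ordinal survives the round trip
    # In the pipe, the key is interned first, e.g.
    #     self.matcher.add(self.vocab.strings.add(key), patterns)
    # so that self.vocab.strings[match_id] later yields "attr_rule_3" again.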
View File

@@ -60,49 +60,6 @@ cdef struct MorphAnalysisC:
     hash_t key
     int length

-    attr_t abbr
-    attr_t adp_type
-    attr_t adv_type
-    attr_t animacy
-    attr_t aspect
-    attr_t case
-    attr_t conj_type
-    attr_t connegative
-    attr_t definite
-    attr_t degree
-    attr_t derivation
-    attr_t echo
-    attr_t foreign
-    attr_t gender
-    attr_t hyph
-    attr_t inf_form
-    attr_t mood
-    attr_t negative
-    attr_t number
-    attr_t name_type
-    attr_t noun_type
-    attr_t num_form
-    attr_t num_type
-    attr_t num_value
-    attr_t part_form
-    attr_t part_type
-    attr_t person
-    attr_t polite
-    attr_t polarity
-    attr_t poss
-    attr_t prefix
-    attr_t prep_case
-    attr_t pron_type
-    attr_t punct_side
-    attr_t punct_type
-    attr_t reflex
-    attr_t style
-    attr_t style_variant
-    attr_t tense
-    attr_t typo
-    attr_t verb_form
-    attr_t voice
-    attr_t verb_type

     attr_t* fields
     attr_t* features

View File

@@ -171,7 +171,7 @@ class DocBin:
             "tokens": tokens.tobytes("C"),
             "spaces": spaces.tobytes("C"),
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
-            "strings": list(self.strings),
+            "strings": list(sorted(self.strings)),
             "cats": self.cats,
             "flags": self.flags,
         }

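Sorting the strings makes DocBin.to_bytes() deterministic: self.strings is a set, and a set's iteration order can differ between Python processes because of hash randomization, so the unsorted payload could vary byte-for-byte across runs even for identical data. A tiny illustration with made-up strings:

    strings = {"NOUN", "ORG", "walks"}        # iteration order not guaranteed
    payload_unsorted = list(strings)          # may differ between processes
    payload_sorted = list(sorted(strings))    # always ["NOUN", "ORG", "walks"]
    assert payload_sorted == ["NOUN", "ORG", "walks"]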
View File

@@ -49,7 +49,9 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
         elif path.parts[-1].endswith(file_type):
             locs.append(path)
     if len(locs) == 0:
-        warnings.warn(Warnings.W090.format(path=orig_path))
+        warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
+    # It's good to sort these, in case the ordering messes up a cache.
+    locs.sort()
     return locs
@@ -200,7 +202,7 @@ class JsonlTexts:
         DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
         """
-        for loc in walk_corpus(self.path, "jsonl"):
+        for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
             for record in records:
                 doc = nlp.make_doc(record["text"])

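The leading dot matters because walk_corpus() matches candidates with str.endswith(): a bare "jsonl" suffix also accepts names that merely end in those letters, while ".jsonl" only accepts the real extension, and the same string now fills the {format} slot of W090. Illustration with made-up file names:

    "texts.jsonl".endswith("jsonl")    # True
    "notes_jsonl".endswith("jsonl")    # True  - would be collected by mistake
    "notes_jsonl".endswith(".jsonl")   # False - correctly skipped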
View File

@@ -62,7 +62,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
     """Read Example dictionaries from a json file or directory."""
     loc = util.ensure_path(loc)
     if loc.is_dir():
-        for filename in loc.iterdir():
+        for filename in sorted(loc.iterdir()):
             yield from read_json_file(loc / filename, limit=limit)
     else:
         with loc.open("rb") as file_: