diff --git a/pyproject.toml b/pyproject.toml
index 5290660aa..14d2c1e8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a34,<8.0.0a40",
+    "thinc>=8.0.0a35,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index a8b237aa1..b3a95dcff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a34,<8.0.0a40
+thinc>=8.0.0a35,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 9831402d1..b080d4330 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/about.py b/spacy/about.py
index ea9f9f33e..fbe772d25 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a25"
+__version__ = "3.0.0a26"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 8f8234c61..3fc530822 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -209,6 +209,8 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
             continue
         else:
             locs.append(path)
+    # It's good to sort these, in case the ordering messes up a cache.
+    locs.sort()
     return locs
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 6d61c2425..cbb0655ef 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -121,20 +121,19 @@ def train(

     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
+        tok2vec_component = config["pretraining"]["component"]
+        if tok2vec_component is None:
             msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
                 exits=1,
             )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = config["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")

     # Create iterator, which yields out info after each optimization step.
     msg.info("Start training")
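For reviewers unfamiliar with the Thinc side of the train.py change: the new code resolves the pretrained weights through named layer references instead of traversing a config path. Below is a minimal, self-contained sketch of that mechanism; the toy two-layer model and the ref name "tok2vec" are illustrative stand-ins, not spaCy's actual architecture.

import numpy
from thinc.api import Linear, chain

# Build a toy model and register an inner layer under a named ref,
# the way spaCy architectures register their tok2vec sublayer.
inner = Linear(nO=2, nI=2)
model = chain(inner, Linear(nO=2, nI=2))
model.set_ref("tok2vec", inner)
model.initialize(X=numpy.zeros((1, 2), dtype="f"))

# The training code above does the equivalent of this: fetch the ref
# named by [pretraining.layer] and overwrite its weights in place.
layer = model.get_ref("tok2vec")
weights_data = layer.to_bytes()  # stand-in for the bytes written by 'pretrain'
layer.from_bytes(weights_data)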
msg.info("Start training") diff --git a/spacy/errors.py b/spacy/errors.py index 4216e3936..640419182 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,7 +85,7 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master - W090 = ("Could not locate any binary .spacy files in path '{path}'.") + W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W093 = ("Could not find any data to train the {name} on. Is your " diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 0d59a1ba0..1dc2a10dd 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -79,23 +79,32 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#call """ - matches = sorted(self.matcher(doc, allow_missing=True)) - - for match_id, start, end in matches: + matches = self.matcher(doc, allow_missing=True) + # Sort by the attribute ID, so that later rules have precendence + matches = [ + (_parse_key(self.vocab.strings[m_id]), m_id, s, e) + for m_id, s, e in matches + ] + matches.sort() + for attr_id, match_id, start, end in matches: span = Span(doc, start, end, label=match_id) - attrs = self.attrs[span.label] - index = self.indices[span.label] + attrs = self.attrs[attr_id] + index = self.indices[attr_id] try: + # The index can be negative, which makes it annoying to do + # the boundscheck. Let Span do it instead. token = span[index] except IndexError: + # The original exception is just our conditional logic, so we + # raise from. raise ValueError( Errors.E1001.format( patterns=self.matcher.get(span.label), span=[t.text for t in span], index=index, ) - ) from None - set_token_attrs(token, attrs) + ) from None + set_token_attrs(span[index], attrs) return doc def pipe(self, stream, *, batch_size=128): @@ -173,7 +182,10 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#add """ - self.matcher.add(len(self.attrs), patterns) + # We need to make a string here, because otherwise the ID we pass back + # will be interpreted as the hash of a string, rather than an ordinal. 
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index a01244d7e..4a51bc9e0 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -60,49 +60,6 @@ cdef struct MorphAnalysisC:
     hash_t key
     int length

-    attr_t abbr
-    attr_t adp_type
-    attr_t adv_type
-    attr_t animacy
-    attr_t aspect
-    attr_t case
-    attr_t conj_type
-    attr_t connegative
-    attr_t definite
-    attr_t degree
-    attr_t derivation
-    attr_t echo
-    attr_t foreign
-    attr_t gender
-    attr_t hyph
-    attr_t inf_form
-    attr_t mood
-    attr_t negative
-    attr_t number
-    attr_t name_type
-    attr_t noun_type
-    attr_t num_form
-    attr_t num_type
-    attr_t num_value
-    attr_t part_form
-    attr_t part_type
-    attr_t person
-    attr_t polite
-    attr_t polarity
-    attr_t poss
-    attr_t prefix
-    attr_t prep_case
-    attr_t pron_type
-    attr_t punct_side
-    attr_t punct_type
-    attr_t reflex
-    attr_t style
-    attr_t style_variant
-    attr_t tense
-    attr_t typo
-    attr_t verb_form
-    attr_t voice
-    attr_t verb_type

     attr_t* fields
     attr_t* features
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index c9a20f6c0..2d4e9af9d 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -171,7 +171,7 @@ class DocBin:
             "tokens": tokens.tobytes("C"),
             "spaces": spaces.tobytes("C"),
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
-            "strings": list(self.strings),
+            "strings": list(sorted(self.strings)),
             "cats": self.cats,
             "flags": self.flags,
         }
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 11f098993..12bda486e 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -49,7 +49,9 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
         elif path.parts[-1].endswith(file_type):
             locs.append(path)
     if len(locs) == 0:
-        warnings.warn(Warnings.W090.format(path=orig_path))
+        warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
+    # It's good to sort these, in case the ordering messes up a cache.
+    locs.sort()
     return locs
@@ -200,7 +202,7 @@ class JsonlTexts:

         DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
         """
-        for loc in walk_corpus(self.path, "jsonl"):
+        for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
             for record in records:
                 doc = nlp.make_doc(record["text"])
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 524da0a16..8b9f5ab2b 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -62,7 +62,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
    """Read Example dictionaries from a json file or directory."""
     loc = util.ensure_path(loc)
     if loc.is_dir():
-        for filename in loc.iterdir():
+        for filename in sorted(loc.iterdir()):
             yield from read_json_file(loc / filename, limit=limit)
     else:
         with loc.open("rb") as file_:
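The sorting changes (convert.py, corpus.py, gold_io.pyx, and the DocBin strings) all serve the same goal: byte-identical output across runs. Python randomizes string hashing per process and directory listings have no guaranteed order, so unsorted iteration could produce different bytes for identical data and defeat checksum-based caching. A small sketch of the DocBin case, using arbitrary example texts and the public DocBin API:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp.make_doc(text) for text in ("hello world", "world hello")]
# With the string set sorted before serialization, serializing the same
# docs now yields the same bytes, even from separate processes.
data = DocBin(docs=docs).to_bytes()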