From 009ba14aafff1769bff408b2069e69245c441d2b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 25 Sep 2020 15:47:10 +0200 Subject: [PATCH 1/8] Fix pretraining in train script (#6143) * update pretraining API in train CLI * bump thinc to 8.0.0a35 * bump to 3.0.0a26 * doc fixes * small doc fix --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/about.py | 2 +- spacy/cli/train.py | 21 +++++++++---------- spacy/errors.py | 2 +- spacy/training/corpus.py | 4 ++-- website/docs/api/cli.md | 8 +++---- website/docs/usage/embeddings-transformers.md | 2 +- website/docs/usage/training.md | 2 +- 10 files changed, 24 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5290660aa..14d2c1e8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a34,<8.0.0a40", + "thinc>=8.0.0a35,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index a8b237aa1..b3a95dcff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a34,<8.0.0a40 +thinc>=8.0.0a35,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 9831402d1..b080d4330 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a34,<8.0.0a40 + thinc>=8.0.0a35,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a34,<8.0.0a40 + thinc>=8.0.0a35,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/about.py b/spacy/about.py index ea9f9f33e..fbe772d25 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a25" +__version__ = "3.0.0a26" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6d61c2425..cbb0655ef 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -121,20 +121,19 @@ def train( # Load pretrained tok2vec weights - cf. 
CLI command 'pretrain' if weights_data is not None: - tok2vec_path = config["pretraining"].get("tok2vec_model", None) - if tok2vec_path is None: + tok2vec_component = config["pretraining"]["component"] + if tok2vec_component is None: msg.fail( - f"To pretrained tok2vec weights, the config needs to specify which " - f"tok2vec layer to load in the setting [pretraining.tok2vec_model].", + f"To use pretrained tok2vec weights, [pretraining.component] " + f"needs to specify the component that should load them.", exits=1, ) - tok2vec = config - for subpath in tok2vec_path.split("."): - tok2vec = tok2vec.get(subpath) - if not tok2vec: - err = f"Could not locate the tok2vec model at {tok2vec_path}" - msg.fail(err, exits=1) - tok2vec.from_bytes(weights_data) + layer = nlp.get_pipe(tok2vec_component).model + tok2vec_layer = config["pretraining"]["layer"] + if tok2vec_layer: + layer = layer.get_ref(tok2vec_layer) + layer.from_bytes(weights_data) + msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'") # Create iterator, which yields out info after each optimization step. msg.info("Start training") diff --git a/spacy/errors.py b/spacy/errors.py index 4216e3936..640419182 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,7 +85,7 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master - W090 = ("Could not locate any binary .spacy files in path '{path}'.") + W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W093 = ("Could not find any data to train the {name} on. Is your " diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 11f098993..848692f47 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -49,7 +49,7 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: elif path.parts[-1].endswith(file_type): locs.append(path) if len(locs) == 0: - warnings.warn(Warnings.W090.format(path=orig_path)) + warnings.warn(Warnings.W090.format(path=orig_path, format=file_type)) return locs @@ -200,7 +200,7 @@ class JsonlTexts: DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call """ - for loc in walk_corpus(self.path, "jsonl"): + for loc in walk_corpus(self.path, ".jsonl"): records = srsly.read_jsonl(loc) for record in records: doc = nlp.make_doc(record["text"]) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 53cd954be..a6cb41e5e 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -754,7 +754,7 @@ in the section `[paths]`. ```cli -$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides] +$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides] ``` | Name | Description | @@ -778,8 +778,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can then include a **path to one of these pretrained weights files** in your [training config](/usage/training#config) as the `init_tok2vec` setting when you train your pipeline. This technique may be especially helpful if you have little -labelled data. See the usage docs on [pretraining](/usage/training#pretraining) -for more info. +labelled data. See the usage docs on +[pretraining](/usage/embeddings-transformers#pretraining) for more info. 
@@ -794,7 +794,7 @@ auto-generated by setting `--pretraining` on
 
 ```cli
-$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides]
+$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
 | Name | Description |
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index b00760e62..97249bfb2 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -752,7 +752,7 @@ network to model something about word cooccurrence statistics. Predicting
 leading and trailing characters does that more than adequately, as the exact
 word sequence could be recovered with high accuracy if the initial and trailing
 characters are predicted accurately. With the vectors objective, the pretraining
-is use the embedding space learned by an algorithm such as
+uses the embedding space learned by an algorithm such as
 [GloVe](https://nlp.stanford.edu/projects/glove/) or
 [Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
 focus on the contextual modelling we actually care about.
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 65afd0eb4..54be6b367 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -175,7 +175,7 @@ sections of a config file are:
 | `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training` | Settings and controls for the training and evaluation process. |
-| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
+| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
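
To make the new loading flow concrete, here is a minimal standalone sketch of what the reworked block in `spacy/cli/train.py` now does. The weights path and the bare `tok2vec` pipe are hypothetical stand-ins; in the CLI this logic runs inside `train()` against the resolved config:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")

# Hypothetical path: any epoch's weights file written out by `spacy pretrain`.
with open("pretrain/model99.bin", "rb") as f:
    weights_data = f.read()

component = "tok2vec"  # corresponds to [pretraining.component] in the config
layer_name = ""        # corresponds to [pretraining.layer]; empty = whole model

layer = nlp.get_pipe(component).model
if layer_name:
    # Resolve a named sublayer via Thinc's node references, as the patch does.
    layer = layer.get_ref(layer_name)
layer.from_bytes(weights_data)
```
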
From c3b5a3cfff2f2e168073d3935afb3fe005f11627 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 25 Sep 2020 15:56:48 +0200
Subject: [PATCH 2/8] Clean up MorphAnalysisC struct (#6146)

---
 spacy/structs.pxd | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index a01244d7e..4a51bc9e0 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -60,49 +60,6 @@ cdef struct MorphAnalysisC:
     hash_t key
     int length
 
-    attr_t abbr
-    attr_t adp_type
-    attr_t adv_type
-    attr_t animacy
-    attr_t aspect
-    attr_t case
-    attr_t conj_type
-    attr_t connegative
-    attr_t definite
-    attr_t degree
-    attr_t derivation
-    attr_t echo
-    attr_t foreign
-    attr_t gender
-    attr_t hyph
-    attr_t inf_form
-    attr_t mood
-    attr_t negative
-    attr_t number
-    attr_t name_type
-    attr_t noun_type
-    attr_t num_form
-    attr_t num_type
-    attr_t num_value
-    attr_t part_form
-    attr_t part_type
-    attr_t person
-    attr_t polite
-    attr_t polarity
-    attr_t poss
-    attr_t prefix
-    attr_t prep_case
-    attr_t pron_type
-    attr_t punct_side
-    attr_t punct_type
-    attr_t reflex
-    attr_t style
-    attr_t style_variant
-    attr_t tense
-    attr_t typo
-    attr_t verb_form
-    attr_t voice
-    attr_t verb_type
 
     attr_t* fields
     attr_t* features

From 3d8388969e2eede035b2b52db999a99e0fd675f8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Sep 2020 19:07:26 +0200
Subject: [PATCH 3/8] Sort paths for cache consistency

---
 spacy/cli/convert.py     | 2 ++
 spacy/training/corpus.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 8f8234c61..3fc530822 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -209,6 +209,8 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
             continue
         else:
             locs.append(path)
+    # It's good to sort these, in case the ordering messes up a cache.
+    locs.sort()
     return locs
 
 
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 848692f47..12bda486e 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -50,6 +50,8 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
             locs.append(path)
     if len(locs) == 0:
         warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
+    # It's good to sort these, in case the ordering messes up a cache.
+ locs.sort() return locs From 26afd3bd90ca175a20b1e8f52abec898655c7fd3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 25 Sep 2020 21:47:22 +0200 Subject: [PATCH 4/8] Fix iteration order --- spacy/training/gold_io.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 524da0a16..8b9f5ab2b 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -62,7 +62,7 @@ def read_json_file(loc, docs_filter=None, limit=None): """Read Example dictionaries from a json file or directory.""" loc = util.ensure_path(loc) if loc.is_dir(): - for filename in loc.iterdir(): + for filename in sorted(loc.iterdir()): yield from read_json_file(loc / filename, limit=limit) else: with loc.open("rb") as file_: From 092ce4648e959453cbc25843f7d9afcb234b540e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 25 Sep 2020 22:20:44 +0200 Subject: [PATCH 5/8] Make DocBin output stable data (set iteration) --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index c9a20f6c0..2d4e9af9d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -171,7 +171,7 @@ class DocBin: "tokens": tokens.tobytes("C"), "spaces": spaces.tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), - "strings": list(self.strings), + "strings": list(sorted(self.strings)), "cats": self.cats, "flags": self.flags, } From 98327f66a9e66366ca3ee99083a5cfd9acfe8d7a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 25 Sep 2020 23:20:50 +0200 Subject: [PATCH 6/8] Fix attributeruler key --- spacy/pipeline/attributeruler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 0d59a1ba0..52f8b7ece 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -173,7 +173,9 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#add """ - self.matcher.add(len(self.attrs), patterns) + # This needs to be a string, because otherwise it's interpreted as a + # string key. 
+        self.matcher.add(f"attr_rules_{len(self.attrs)}", patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)

From 821f37254cf1caca8f943574b4cbaaaea4cfb251 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 26 Sep 2020 00:19:53 +0200
Subject: [PATCH 7/8] Fix attributeruler

---
 spacy/pipeline/attributeruler.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 52f8b7ece..e1ad91340 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -80,11 +80,14 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
         matches = sorted(self.matcher(doc, allow_missing=True))
+        print("Attrs", self.attrs)
+        print("Matches", matches)
 
         for match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
-            attrs = self.attrs[span.label]
-            index = self.indices[span.label]
+            attr_id = _parse_key(span.label_)
+            attrs = self.attrs[attr_id]
+            index = self.indices[attr_id]
             try:
                 token = span[index]
             except IndexError:
@@ -173,9 +176,10 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#add
         """
-        # This needs to be a string, because otherwise it's interpreted as a
-        # string key.
-        self.matcher.add(f"attr_rules_{len(self.attrs)}", patterns)
+        # We need to make a string here, because otherwise the ID we pass back
+        # will be interpreted as the hash of a string, rather than an ordinal.
+        key = _make_key(len(self.attrs))
+        self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)
@@ -199,7 +203,7 @@ class AttributeRuler(Pipe):
         all_patterns = []
         for i in range(len(self.attrs)):
             p = {}
-            p["patterns"] = self.matcher.get(i)[1]
+            p["patterns"] = self.matcher.get(_make_key(i))[1]
             p["attrs"] = self._attrs_unnormed[i]
             p["index"] = self.indices[i]
             all_patterns.append(p)
@@ -303,6 +307,12 @@ class AttributeRuler(Pipe):
         return self
 
 
+def _make_key(n_attr):
+    return f"attr_rule_{n_attr}"
+
+
+def _parse_key(key):
+    return int(key.rsplit("_", 1)[1])
+
 
 def _split_morph_attrs(attrs):
     """Split entries from a tag map or morph rules dict into two dicts, one

From 702edf52a0dcef071b49e0b52af7de6cfc9be140 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 26 Sep 2020 00:23:09 +0200
Subject: [PATCH 8/8] Fix attributeruler

---
 spacy/pipeline/attributeruler.py | 22 ++++++++++++++------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index e1ad91340..1dc2a10dd 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -79,26 +79,32 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
-        matches = sorted(self.matcher(doc, allow_missing=True))
-        print("Attrs", self.attrs)
-        print("Matches", matches)
-
-        for match_id, start, end in matches:
+        matches = self.matcher(doc, allow_missing=True)
+        # Sort by the attribute ID, so that later rules have precedence
+        matches = [
+            (_parse_key(self.vocab.strings[m_id]), m_id, s, e)
+            for m_id, s, e in matches
+        ]
+        matches.sort()
+        for attr_id, match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
-            attr_id = _parse_key(span.label_)
             attrs = self.attrs[attr_id]
             index = self.indices[attr_id]
             try:
+                # The index can be negative, which makes it annoying to do
+                # the boundscheck. Let Span do it instead.
                 token = span[index]
             except IndexError:
+                # The original exception is just our conditional logic, so we
+                # raise from.
                 raise ValueError(
                     Errors.E1001.format(
                         patterns=self.matcher.get(span.label),
                         span=[t.text for t in span],
                         index=index,
                     )
-                ) from None
-        set_token_attrs(token, attrs)
+                ) from None
+        set_token_attrs(span[index], attrs)
         return doc
 
     def pipe(self, stream, *, batch_size=128):
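
Patches 3-5 all address the same reproducibility issue: directory listings and Python set iteration have no guaranteed order, so byte-identical inputs could serialize differently from run to run and defeat content-hash caching. A standalone illustration of the principle (not spaCy code):

```python
import hashlib

# Set iteration order depends on hash randomization and can differ between
# interpreter runs; sorting first makes the serialized bytes deterministic.
strings = {"apple", "banana", "cherry"}
payload = ",".join(sorted(strings)).encode("utf8")

# The content hash is now stable across runs, so it works as a cache key.
print(hashlib.sha1(payload).hexdigest())
```
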
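Patches 6-8 converge on one idea: the `Matcher` stores rule keys as 64-bit hashes, so the bare integer `len(self.attrs)` was indistinguishable from a precomputed hash with no string behind it. Encoding the rule's ordinal in a string key keeps it recoverable from `vocab.strings`. A sketch of that round trip, with standalone stand-ins for the private helpers `_make_key`/`_parse_key` from patch 7:

```python
from spacy.vocab import Vocab

def make_key(n_attr: int) -> str:
    # Stand-in for _make_key: encode the rule ordinal in a string key.
    return f"attr_rule_{n_attr}"

def parse_key(key: str) -> int:
    # Stand-in for _parse_key: recover the ordinal from the key.
    return int(key.rsplit("_", 1)[1])

vocab = Vocab()
match_id = vocab.strings.add(make_key(3))  # the hash the Matcher reports back
assert vocab.strings[match_id] == "attr_rule_3"
assert parse_key(vocab.strings[match_id]) == 3
```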