Merge pull request #10100 from svlandeg/feature/master_copy

Update develop with latest from master (2)
Commit d2afdfefc2 by Sofie Van Landeghem, 2022-01-20 14:29:50 +01:00, committed by GitHub (GPG Key ID: 4AEE18F83AFDEB23)
31 changed files with 331 additions and 91 deletions

View File

@ -108,8 +108,8 @@ apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.4.9
sudachidict_core>=20200330
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
th =

View File

@ -1,3 +1,6 @@
from .errors import Errors
IOB_STRINGS = ("", "I", "O", "B")
IDS = {
"": NULL_ATTR,
@ -64,7 +67,6 @@ IDS = {
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
@ -72,7 +74,6 @@ IDS = {
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
@ -87,7 +88,7 @@ IDS = {
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX
"IDX": IDX,
}
@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
inty_attrs = {}
if _do_deprecated:
if 'F' in stringy_attrs:
if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if 'L' in stringy_attrs:
if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if 'pos' in stringy_attrs:
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if 'morph' in stringy_attrs:
morphs = stringy_attrs.pop('morph')
if 'number' in stringy_attrs:
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph")
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
stringy_attrs.pop("tenspect")
morph_keys = [
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
'NumValue', 'PartType', 'Polite', 'StyleVariant',
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', 'PrepCase', 'Animacy' # U20
"PunctType",
"PunctSide",
"Other",
"Degree",
"AdvType",
"Number",
"VerbForm",
"PronType",
"Aspect",
"Tense",
"PartType",
"Poss",
"Hyph",
"ConjType",
"NumType",
"Foreign",
"VerbType",
"NounType",
"Gender",
"Mood",
"Negative",
"Tense",
"Voice",
"Abbr",
"Derivation",
"Echo",
"Foreign",
"NameType",
"NounType",
"NumForm",
"NumValue",
"PartType",
"Polite",
"StyleVariant",
"PronType",
"AdjType",
"Person",
"Variant",
"AdpType",
"Reflex",
"Negative",
"Mood",
"Aspect",
"Case",
"Polarity",
"PrepCase",
"Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if int_key == ENT_IOB:
if value in IOB_STRINGS:
value = IOB_STRINGS.index(value)
elif isinstance(value, str):
raise ValueError(Errors.E1025.format(value=value))
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
if hasattr(strings_map, "add"):
value = strings_map.add(value)
else:
value = strings_map[value]
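The net effect of the new ENT_IOB handling in `intify_attrs`, mirroring the test added further down in this PR (a sketch):

```python
from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings are converted to their index in IOB_STRINGS ("", "I", "O", "B")
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}
# Any other string raises a ValueError with the new E1025 message
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError:
    pass
```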

View File

@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer
from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
@ -699,8 +699,34 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
return count
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
if pipe_name not in nlp.pipe_names:
return set()
def _get_labels_from_model(
nlp: Language, factory_name: str
) -> Set[str]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == factory_name
]
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
return set(pipe.labels)
labels.update(pipe.labels)
return labels
def _get_labels_from_spancat(
nlp: Language
) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, SpanCategorizer)
if pipe.key not in labels:
labels[pipe.key] = set()
labels[pipe.key].update(pipe.labels)
return labels
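A sketch of how the refactored helpers are exercised (this mirrors the `test_get_labels_from_model` test added below): lookup is keyed by factory name and labels are aggregated over all matching pipes.

```python
from spacy.lang.en import English
from spacy.cli.debug_data import _get_labels_from_model, _get_labels_from_spancat

nlp = English()
ner = nlp.add_pipe("ner", name="my_ner")  # pipe name may differ from the factory name
ner.add_label("PERSON")
nlp.initialize()
assert _get_labels_from_model(nlp, "ner") == {"PERSON"}
# _get_labels_from_spancat(nlp) returns a dict keyed by each spancat's span key
```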

View File

@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
"LOC": "#ff9561",
"PERSON": "#aa9cfc",
"NORP": "#c887fb",
"FACILITY": "#9cc9cc",
"FAC": "#9cc9cc",
"EVENT": "#ffeb80",
"LAW": "#ff8197",
"LANGUAGE": "#ff8197",

View File

@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
"patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
"supported values are: 'I', 'O', 'B' and ''")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -1285,9 +1285,9 @@ class Language:
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
if self.vocab.vectors.data.shape[1] >= 1:
if self.vocab.vectors.shape[1] >= 1:
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
self.vocab.vectors.to_ops(ops)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
self.tokenizer.initialize, # type: ignore[union-attr]
@ -1332,8 +1332,8 @@ class Language:
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if self.vocab.vectors.shape[1] >= 1:
self.vocab.vectors.to_ops(ops)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]
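The replacement idiom used in both hunks, in isolation (a sketch; `Vectors.to_ops` is added later in this PR):

```python
import spacy
from thinc.api import get_current_ops

nlp = spacy.blank("en")
ops = get_current_ops()
# replaces the old pattern: nlp.vocab.vectors.data = ops.asarray(nlp.vocab.vectors.data)
nlp.vocab.vectors.to_ops(ops)
```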

View File

@ -130,7 +130,9 @@ cdef class Lexeme:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
@property
def has_vector(self):

View File

@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
attr = "SENT_START"
attr = IDS.get(attr)
if isinstance(value, str):
if attr == ENT_IOB and value in Token.iob_strings():
value = Token.iob_strings().index(value)
else:
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)

View File

@ -23,7 +23,7 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
if vocab.vectors.data.shape[1] == 0:
if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@ -116,7 +116,7 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
nO = vocab.vectors.data.shape[1]
nO = vocab.vectors.shape[1]
output_layer = chain(
cast(Model[List["Floats2d"], Floats2d], list2array()),
Maxout(

View File

@ -94,7 +94,7 @@ def init(
nM = model.get_dim("nM") if model.has_dim("nM") else None
nO = model.get_dim("nO") if model.has_dim("nO") else None
if X is not None and len(X):
nM = X[0].vocab.vectors.data.shape[1]
nM = X[0].vocab.vectors.shape[1]
if Y is not None:
nO = Y.data.shape[1]
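All of these call sites apply the same substitution: read dimensions via `Vectors.shape` (and `Vectors.size`) instead of reaching into `vectors.data.shape`. A minimal sketch:

```python
from spacy.vectors import Vectors

v = Vectors(shape=(10, 64))
assert v.shape == (10, 64)  # dimensions without touching v.data
assert v.size == 10 * 64    # the size property now returns self.data.size
```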

View File

@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
class TokenPattern(BaseModel):
@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
ent_iob: Optional[IobValue] = None
ent_id: Optional[StringValue] = None
ent_kb_id: Optional[StringValue] = None
norm: Optional[StringValue] = None
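What the new `IobValue` field buys at validation time, mirroring the `TEST_PATTERNS` entry added further down (a sketch):

```python
from spacy.schemas import validate_token_pattern

assert validate_token_pattern([{"ENT_IOB": "I"}]) == []    # valid: no error messages
assert validate_token_pattern([{"ENT_IOB": "foo"}]) != []  # invalid value is reported
```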

View File

@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
"Merging the docs is fun.",
"",
"They don't think alike. ",
"",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
en_docs[3].spans["group"] = [en_docs[3][0:1]]
en_docs[4].spans["group"] = [en_docs[4][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)

View File

@ -1,4 +1,5 @@
import pytest
from spacy.attrs import intify_attrs, ENT_IOB
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
from spacy.lang.en.stop_words import STOP_WORDS
@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
def test_attrs_ent_iob_intify():
int_attrs = intify_attrs({"ENT_IOB": ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({"ENT_IOB": "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({"ENT_IOB": "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({"ENT_IOB": "B"})
assert int_attrs == {ENT_IOB: 3}
int_attrs = intify_attrs({ENT_IOB: ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({ENT_IOB: "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({ENT_IOB: "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({ENT_IOB: "B"})
assert int_attrs == {ENT_IOB: 3}
with pytest.raises(ValueError):
int_attrs = intify_attrs({"ENT_IOB": "XX"})
with pytest.raises(ValueError):
int_attrs = intify_attrs({ENT_IOB: "XX"})
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match):
assert is_punct(text) == match

View File

@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
assert len(matcher(doc)) == 0
def test_matcher_ent_iob_key(en_vocab):
"""Test that patterns with ent_iob works correctly."""
matcher = Matcher(en_vocab)
matcher.add("Rule", [[{"ENT_IOB": "I"}]])
doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
assert len(matches1) == 1
assert matches1[0] == "York"
assert len(matches2) == 0
matcher = Matcher(en_vocab) # Test iob pattern with operators
matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
doc = Doc(
en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
)
doc.ents = [Span(doc, 4, 7, label="PERSON")]
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 3
assert matches[0] == "Maria"
assert matches[1] == "Maria Esperanza"
assert matches[2] == "Esperanza"

View File

@ -12,6 +12,7 @@ TEST_PATTERNS = [
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
([{"_": "foo"}], 1, 1),
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)

View File

@ -79,7 +79,8 @@ def test_explicit_labels():
nlp.initialize()
assert spancat.labels == ("PERSON", "LOC")
#TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
# If the Doc object is garbage collected, the spans won't be functional afterwards
nlp = Language()
@ -97,6 +98,7 @@ def test_doc_gc():
assert isinstance(spangroups, SpanGroups)
for key, spangroup in spangroups.items():
assert isinstance(spangroup, SpanGroup)
# XXX This fails with length 0 sometimes
assert len(spangroup) > 0
with pytest.raises(RuntimeError):
span = spangroup[0]

View File

@ -12,6 +12,8 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
@ -665,3 +667,28 @@ def test_get_third_party_dependencies():
)
def test_is_subpath_of(parent, child, expected):
assert is_subpath_of(parent, child) == expected
@pytest.mark.slow
@pytest.mark.parametrize(
"factory_name,pipe_name",
[
("ner", "ner"),
("ner", "my_ner"),
("spancat", "spancat"),
("spancat", "my_spancat"),
],
)
def test_get_labels_from_model(factory_name, pipe_name):
labels = ("A", "B")
nlp = English()
pipe = nlp.add_pipe(factory_name, name=pipe_name)
for label in labels:
pipe.add_label(label)
nlp.initialize()
assert nlp.get_pipe(pipe_name).labels == labels
if factory_name == "spancat":
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
else:
assert _get_labels_from_model(nlp, factory_name) == set(labels)

View File

@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
assert lex1.vector_norm != 0
assert lex2.vector_norm != 0
assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
assert isinstance(lex1.similarity(lex2), float)
assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
assert doc[0].vector_norm != 0
assert doc[1].vector_norm != 0
assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
assert isinstance(doc[0].similarity(doc[1]), float)
assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
def test_vectors_similarity_SS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc[0:1].similarity(doc[0:2]), float)
assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
def test_vectors_similarity_DD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc1 = Doc(vocab, words=[word1, word2])
doc2 = Doc(vocab, words=[word2, word1])
assert isinstance(doc1.similarity(doc2), float)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[-2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc.similarity(doc[:2]), float)
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)

View File

@ -421,7 +421,7 @@ def test_vector_is_oov():
def test_init_vectors_unset():
v = Vectors(shape=(10, 10))
assert v.is_full is False
assert v.data.shape == (10, 10)
assert v.shape == (10, 10)
with pytest.raises(ValueError):
v = Vectors(shape=(10, 10), mode="floret")
@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# rows: 2 rows per ngram
rows = OPS.xp.asarray(
[
h % nlp.vocab.vectors.data.shape[0]
h % nlp.vocab.vectors.shape[0]
for ngram in ngrams
for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
],
@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# an empty key returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab[""].vector),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# an empty batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
numpy.zeros((1, nlp.vocab.vectors.shape[0])),
)
# an empty key within a batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# the loaded ngram vector table cannot be modified

View File

@ -616,7 +616,7 @@ cdef class Doc:
"""
if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self)
elif self.vocab.vectors.data.size:
elif self.vocab.vectors.size:
return True
elif self.tensor.size:
return True
@ -641,7 +641,7 @@ cdef class Doc:
if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self)
return self._vector
elif self.tensor.size > 0:
@ -1183,7 +1183,7 @@ cdef class Doc:
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
if not (len(doc) > 0 and doc[-1].is_space):
if len(doc) > 0 and not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)
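With the flipped condition, empty docs are simply skipped when deciding where to insert joining whitespace, instead of forcing a trailing space after the preceding doc (cf. the updated `test_doc_api_from_docs` above). A sketch of the call:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("Merging the docs is fun."), nlp(""), nlp("Another doc.")]
merged = Doc.from_docs(docs)
assert len(merged) == len(docs[0]) + len(docs[2])  # empty docs contribute no tokens
```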

View File

@ -352,7 +352,9 @@ cdef class Span:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
@ -485,7 +487,7 @@ cdef class Span:
"""
if "has_vector" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["has_vector"](self)
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
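The same `.item()` scalar fix is applied to `Lexeme.similarity` above and `Token.similarity` below; its visible effect, assuming a pipeline with word vectors such as `en_core_web_md` (a sketch):

```python
import spacy

nlp = spacy.load("en_core_web_md")  # any pipeline with word vectors
doc = nlp("apple banana")
assert isinstance(doc[0].similarity(doc[1]), float)  # a plain Python float on numpy and cupy alike
```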

View File

@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
from .. import parts_of_speech
from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args
@ -209,7 +210,9 @@ cdef class Token:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
def has_morph(self):
"""Check whether the token has annotated morph information.
@ -743,7 +746,7 @@ cdef class Token:
@classmethod
def iob_strings(cls):
return ("", "I", "O", "B")
return IOB_STRINGS
@property
def ent_iob_(self):

View File

@ -164,7 +164,7 @@ def load_vectors_into_model(
len(vectors_nlp.vocab.vectors.keys()) == 0
and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
) or (
vectors_nlp.vocab.vectors.data.shape[0] == 0
vectors_nlp.vocab.vectors.shape[0] == 0
and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
):
logger.warning(Warnings.W112.format(name=name))

View File

@ -10,7 +10,7 @@ from typing import cast
import warnings
from enum import Enum
import srsly
from thinc.api import get_array_module, get_current_ops
from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
@ -146,7 +146,7 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#size
"""
return self.data.shape[0] * self.data.shape[1]
return self.data.size
@property
def is_full(self):
@ -517,6 +517,9 @@ cdef class Vectors:
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
def to_ops(self, ops: Ops):
self.data = ops.asarray(self.data)
def _get_cfg(self):
if self.mode == Mode.default:
return {

View File

@ -283,7 +283,7 @@ cdef class Vocab:
@property
def vectors_length(self):
return self.vectors.data.shape[1]
return self.vectors.shape[1]
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
@ -294,7 +294,7 @@ cdef class Vocab:
elif shape is not None:
self.vectors = Vectors(strings=self.strings, shape=shape)
else:
width = width if width is not None else self.vectors.data.shape[1]
width = width if width is not None else self.vectors.shape[1]
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
def prune_vectors(self, nr_row, batch_size=1024):

View File

@ -99,9 +99,9 @@ be a token pattern (list) or a phrase pattern (string). For example:
## EntityRuler.initialize {#initialize tag="method" new="3"}
Initialize the component with data and used before training to load in rules
from a file. This method is typically called by
[`Language.initialize`](/api/language#initialize) and lets you customize
arguments it receives via the
from a [pattern file](/usage/rule-based-matching/#entityruler-files). This method
is typically called by [`Language.initialize`](/api/language#initialize) and
lets you customize arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.
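For reference, a minimal sketch of initializing the ruler with patterns in code; the docs change above points to loading the same data from a pattern file via the `[initialize.components]` block instead:

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "spaCy"}]  # would normally be read from a pattern file
ruler.initialize(lambda: [], patterns=patterns)
```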

View File

@ -44,6 +44,7 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ |
|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
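A short pattern using the new `ENT_IOB` attribute (this mirrors the matcher test added in this PR):

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])  # tokens inside (but not starting) an entity

doc = nlp("I visited New York")
doc.ents = [Span(doc, 2, 4, label="GPE")]
assert [doc[s:e].text for _, s, e in matcher(doc)] == ["York"]
```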

View File

@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
| ------ | --------------------------------------- |
| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
## Vectors.to_ops {#to_ops tag="method"}
Change the embedding matrix to use different Thinc ops.
> #### Example
>
> ```python
> from thinc.api import NumpyOps
>
> vectors.to_ops(NumpyOps())
>
> ```
| Name | Description |
|-------|----------------------------------------------------------|
| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
## Vectors.to_disk {#to_disk tag="method"}
Save the current state to a directory.

View File

@ -1479,7 +1479,7 @@ especially useful if you want to pass in a string instead of calling
### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3}
This example shows the implementation of a pipeline component that fetches
country meta data via the [REST Countries API](https://restcountries.eu), sets
country meta data via the [REST Countries API](https://restcountries.com), sets
entity annotations for countries and sets custom attributes on the `Doc` and
`Span`, for example, the capital, latitude/longitude coordinates and even the
country flag.
@ -1495,7 +1495,7 @@ from spacy.tokens import Doc, Span, Token
@Language.factory("rest_countries")
class RESTCountriesComponent:
def __init__(self, nlp, name, label="GPE"):
r = requests.get("https://restcountries.eu/rest/v2/all")
r = requests.get("https://restcountries.com/v2/all")
r.raise_for_status() # make sure requests raises an error if it fails
countries = r.json()
# Convert API response to dict keyed by country name for easy lookup

View File

@ -1770,9 +1770,9 @@
"title": "Applied Language Technology",
"slogan": "NLP for newcomers using spaCy and Stanza",
"description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.",
"url": "https://applied-language-technology.readthedocs.io/",
"url": "https://applied-language-technology.mooc.fi",
"image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg",
"thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png",
"thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png",
"author": "Tuomo Hiippala",
"author_links": {
"twitter": "tuomo_h",

View File

@ -113,8 +113,7 @@ const QuickstartInstall = ({ id, title }) => {
{
id: 'venv',
title: 'virtual env',
help:
'Use a virtual environment and install spaCy into a user directory',
help: 'Use a virtual environment',
},
{
id: 'train',
@ -165,27 +164,51 @@ const QuickstartInstall = ({ id, title }) => {
setters={setters}
showDropdown={showDropdown}
>
<QS config="venv">python -m venv .env</QS>
<QS config="venv" os="mac">
<QS package="pip" config="venv">
python -m venv .env
</QS>
<QS package="pip" config="venv" os="mac">
source .env/bin/activate
</QS>
<QS config="venv" os="linux">
<QS package="pip" config="venv" os="linux">
source .env/bin/activate
</QS>
<QS config="venv" os="windows">
<QS package="pip" config="venv" os="windows">
.env\Scripts\activate
</QS>
<QS package="source" config="venv">
python -m venv .env
</QS>
<QS package="source" config="venv" os="mac">
source .env/bin/activate
</QS>
<QS package="source" config="venv" os="linux">
source .env/bin/activate
</QS>
<QS package="source" config="venv" os="windows">
.env\Scripts\activate
</QS>
<QS package="conda" config="venv">
conda create -n venv
</QS>
<QS package="conda" config="venv">
conda activate venv
</QS>
<QS package="pip">pip install -U pip setuptools wheel</QS>
<QS package="source">pip install -U pip setuptools wheel</QS>
<QS package="pip">
pip install -U {pkg}
{pipExtras && `[${pipExtras}]`}
{pipExtras
? `pip install -U '${pkg}[${pipExtras}]'`
: `pip install -U ${pkg}`}
{nightly ? ' --pre' : ''}
</QS>
<QS package="conda">conda install -c conda-forge spacy</QS>
<QS package="conda" hardware="gpu">
conda install -c conda-forge cupy
</QS>
<QS package="conda" config="train">
conda install -c conda-forge spacy-transformers
</QS>
<QS package="source">
git clone https://github.com/{repo}
{nightly ? ` --branch ${DEFAULT_BRANCH}` : ''}
@ -205,9 +228,6 @@ const QuickstartInstall = ({ id, title }) => {
<QS config="train" package="conda" comment prompt={false}>
# packages only available via pip
</QS>
<QS config="train" package="conda">
pip install spacy-transformers
</QS>
<QS config="train" package="conda">
pip install spacy-lookups-data
</QS>