Merge branch 'master' into spacy.io
Commit 4c863aeb06, mirror of https://github.com/explosion/spaCy.git

@@ -67,7 +67,7 @@ valuable if it's shared publicly, so that more people can benefit from it.
 - Non-destructive **tokenization**
 - **Named entity** recognition
-- Support for **49+ languages**
+- Support for **50+ languages**
 - Pre-trained [statistical models](https://spacy.io/models) and word vectors
 - State-of-the-art speed
 - Easy **deep learning** integration

@@ -70,15 +70,33 @@ def merge_sents(sents):
     return [(m_deps, m_brackets)]
 
 
-def align(cand_words, gold_words):
-    if cand_words == gold_words:
-        alignment = numpy.arange(len(cand_words))
+def align(tokens_a, tokens_b):
+    """Calculate alignment tables between two tokenizations, using the Levenshtein
+    algorithm. The alignment is case-insensitive.
+
+    tokens_a (List[str]): The candidate tokenization.
+    tokens_b (List[str]): The reference tokenization.
+    RETURNS: (tuple): A 5-tuple consisting of the following information:
+      * cost (int): The number of misaligned tokens.
+      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
+        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
+        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
+        it has the value -1.
+      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
+      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
+        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
+        the same token of `tokens_b`.
+      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
+        direction.
+    """
+    if tokens_a == tokens_b:
+        alignment = numpy.arange(len(tokens_a))
         return 0, alignment, alignment, {}, {}
-    cand_words = [w.replace(" ", "").lower() for w in cand_words]
-    gold_words = [w.replace(" ", "").lower() for w in gold_words]
-    cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
-    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
-                                              [len(w) for w in gold_words])
+    tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
+    tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
+    cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
+    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
+                                              [len(w) for w in tokens_b])
     for i, j in list(i2j_multi.items()):
         if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
             i2j[i] = j
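
For illustration, here is a minimal usage sketch of the renamed function's documented behavior (assuming spaCy v2.1, where `align` is importable from `spacy.gold`; the expected values follow the docstring above and the goldparse docs further down). Note also that the final `i2j_multi` loop appears to promote isolated many-to-one entries – cases where neither neighboring token maps to the same target – back into the one-to-one table.

```python
from spacy.gold import align  # assumes spaCy v2.1

# Candidate vs. reference tokenization of the same underlying string.
tokens_a = ["i", "'", "m", "fine"]  # candidate
tokens_b = ["i", "'m", "fine"]      # reference

cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
print(cost)        # 2 - "'" and "m" have no one-to-one counterpart
print(list(a2b))   # [0, -1, -1, 2]
print(a2b_multi)   # {1: 1, 2: 1} - both fragments align to tokens_b[1] ("'m")
```
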
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 import pytest
 import spacy
 from spacy.util import minibatch, compounding

@@ -9,27 +8,25 @@ from spacy.util import minibatch, compounding
 def test_issue3611():
     """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
     unique_classes = ["offensive", "inoffensive"]
-    x_train = ["This is an offensive text",
-               "This is the second offensive text",
-               "inoff"]
+    x_train = [
+        "This is an offensive text",
+        "This is the second offensive text",
+        "inoff",
+    ]
     y_train = ["offensive", "offensive", "inoffensive"]
 
     # preparing the data
     pos_cats = list()
     for train_instance in y_train:
         pos_cats.append({label: label == train_instance for label in unique_classes})
-    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+    train_data = list(zip(x_train, [{"cats": cats} for cats in pos_cats]))
 
     # set up the spacy model with a text categorizer component
-    nlp = spacy.blank('en')
+    nlp = spacy.blank("en")
 
     textcat = nlp.create_pipe(
         "textcat",
-        config={
-            "exclusive_classes": True,
-            "architecture": "bow",
-            "ngram_size": 2
-        }
+        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
     )
 
     for label in unique_classes:

@@ -37,7 +34,7 @@ def test_issue3611():
     nlp.add_pipe(textcat, last=True)
 
     # training the network
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
     with nlp.disable_pipes(*other_pipes):
         optimizer = nlp.begin_training()
         for i in range(3):

@@ -46,6 +43,10 @@ def test_issue3611():
 
         for batch in batches:
             texts, annotations = zip(*batch)
-            nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
-
-
+            nlp.update(
+                docs=texts,
+                golds=annotations,
+                sgd=optimizer,
+                drop=0.1,
+                losses=losses,
+            )

@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 from spacy.lang.hi import Hindi
 
 
 def test_issue3625():
     """Test that default punctuation rules applies to hindi unicode characters"""
     nlp = Hindi()
-    doc = nlp(u"hi. how हुए. होटल, होटल")
-    assert [token.text for token in doc] == ['hi', '.', 'how', 'हुए', '.', 'होटल', ',', 'होटल']
+    doc = nlp("hi. how हुए. होटल, होटल")
+    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
+    assert [token.text for token in doc] == expected

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 import pytest
 from spacy.matcher import Matcher
 from spacy.tokens import Doc

@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import pytest
 
 from spacy.attrs import IS_ALPHA
 from spacy.lang.en import English

@@ -10,11 +9,11 @@ from spacy.lang.en import English
 @pytest.mark.parametrize(
     "sentence",
     [
-        'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.',
-        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.',
-        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s number one',
-        'Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.',
-        "It was a missed assignment, but it shouldn't have resulted in a turnover ..."
+        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
+        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
+        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
+        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
+        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
     ],
 )
 def test_issue3869(sentence):

@@ -27,5 +26,3 @@ def test_issue3869(sentence):
     count += token.is_alpha
 
     assert count == doc.count_by(IS_ALPHA).get(1, 0)
-
-

spacy/tests/regression/test_issue3951.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.matcher import Matcher
+from spacy.tokens import Doc
+
+
+@pytest.mark.xfail
+def test_issue3951(en_vocab):
+    """Test that combinations of optional rules are matched correctly."""
+    matcher = Matcher(en_vocab)
+    pattern = [
+        {"LOWER": "hello"},
+        {"LOWER": "this", "OP": "?"},
+        {"OP": "?"},
+        {"LOWER": "world"},
+    ]
+    matcher.add("TEST", None, pattern)
+    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
+    matches = matcher(doc)
+    assert len(matches) == 0

spacy/tests/regression/test_issue3972.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.matcher import PhraseMatcher
+from spacy.tokens import Doc
+
+
+@pytest.mark.xfail
+def test_issue3972(en_vocab):
+    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
+    """
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", None, Doc(en_vocab, words=["New", "York"]))
+    matcher.add("B", None, Doc(en_vocab, words=["New", "York"]))
+    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
+    matches = matcher(doc)
+    assert len(matches) == 2

@@ -76,6 +76,50 @@ Convert a list of Doc objects into the
 | `id`        | int  | ID to assign to the JSON. Defaults to `0`. |
 | **RETURNS** | list | The data in spaCy's JSON format.           |
 
+### gold.align {#align tag="function"}
+
+Calculate alignment tables between two tokenizations, using the Levenshtein
+algorithm. The alignment is case-insensitive.
+
+> #### Example
+>
+> ```python
+> from spacy.gold import align
+>
+> bert_tokens = ["obama", "'", "s", "podcast"]
+> spacy_tokens = ["obama", "'s", "podcast"]
+> alignment = align(bert_tokens, spacy_tokens)
+> cost, a2b, b2a, a2b_multi, b2a_multi = alignment
+> ```
+
+| Name        | Type  | Description                                                                |
+| ----------- | ----- | -------------------------------------------------------------------------- |
+| `tokens_a`  | list  | String values of candidate tokens to align.                                |
+| `tokens_b`  | list  | String values of reference tokens to align.                                |
+| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. |
+
+The returned tuple contains the following alignment information:
+
+> #### Example
+>
+> ```python
+> a2b = array([0, -1, -1, 2])
+> b2a = array([0, 2, 3])
+> a2b_multi = {1: 1, 2: 1}
+> b2a_multi = {}
+> ```
+>
+> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If
+> there's no one-to-one alignment for a token, it has the value `-1`.
+
+| Name        | Type                                   | Description                                                                                                                                      |
+| ----------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `cost`      | int                                    | The number of misaligned tokens.                                                                                                                 |
+| `a2b`       | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`.                                                                           |
+| `b2a`       | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`.                                                                           |
+| `a2b_multi` | dict                                   | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. |
+| `b2a_multi` | dict                                   | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. |
+
 ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
 
 Encode labelled spans into per-token tags, using the
@@ -963,6 +963,71 @@ Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set
 the part-of-speech tags, syntactic dependencies, named entities and other
 attributes. For details, see the respective usage pages.
 
+### Aligning tokenization {#aligning-tokenization}
+
+spaCy's tokenization is non-destructive and uses language-specific rules
+optimized for compatibility with treebank annotations. Other tools and resources
+can sometimes tokenize things differently – for example, `"I'm"` →
+`["I", "'", "m"]` instead of `["I", "'m"]`.
+
+In cases like that, you often want to align the tokenization so that you can
+merge annotations from different sources together, or take vectors predicted by
+a [pre-trained BERT model](https://github.com/huggingface/pytorch-transformers)
+and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align)
+helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the
+number of misaligned tokens, the one-to-one mappings of token indices in both
+directions and the indices where multiple tokens align to one single token.
+
+> #### ✏️ Things to try
+>
+> 1. Change the capitalization in one of the token lists – for example,
+>    `"obama"` to `"Obama"`. You'll see that the alignment is case-insensitive.
+> 2. Change `"podcasts"` in `other_tokens` to `"pod", "casts"`. You should see
+>    that there are now 4 misaligned tokens and that the new many-to-one mapping
+>    is reflected in `a2b_multi`.
+> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that the
+>    `cost` is `0` and all corresponding mappings are also identical.
+
+```python
+### {executable="true"}
+from spacy.gold import align
+
+other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
+spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
+cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
+print("Misaligned tokens:", cost)  # 2
+print("One-to-one mappings a -> b", a2b)  # array([0, 1, 2, 3, -1, -1, 5, 6])
+print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, 5, 6, 7])
+print("Many-to-one mappings a -> b", a2b_multi)  # {4: 4, 5: 4}
+print("Many-to-one mappings b -> a", b2a_multi)  # {}
+```
+
+Here are some insights from the alignment information generated in the example
+above:
+
+- Two tokens are misaligned.
+- The one-to-one mappings for the first four tokens are identical, which means
+  they map to each other. This makes sense because they're also identical in the
+  input: `"i"`, `"listened"`, `"to"` and `"obama"`.
+- The value of `a2b[6]` is `5`, which means that `other_tokens[6]`
+  (`"podcasts"`) aligns to `spacy_tokens[5]` (also `"podcasts"`).
+- `a2b[4]` is `-1`, which means that there is no one-to-one alignment for the
+  token at `other_tokens[4]`. The token `"'"` doesn't exist on its own in
+  `spacy_tokens`. The same goes for `a2b[5]` and `other_tokens[5]`, i.e. `"s"`.
+- The dictionary `a2b_multi` shows that both tokens 4 and 5 of `other_tokens`
+  (`"'"` and `"s"`) align to token 4 of `spacy_tokens` (`"'s"`).
+- The dictionary `b2a_multi` shows that there are no tokens in `spacy_tokens`
+  that map to multiple tokens in `other_tokens`.
+
+<Infobox title="Important note" variant="warning">
+
+The current implementation of the alignment algorithm assumes that both
+tokenizations add up to the same string. For example, you'll be able to align
+`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not
+`["I", "'m"]` and `["I", "am"]`.
+
+</Infobox>
+
 ## Merging and splitting {#retokenization new="2.1"}
 
 The [`Doc.retokenize`](/api/doc#retokenize) context manager lets you merge and
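
Given the warning above – `align` assumes both tokenizations add up to the same string – a guard before calling it can fail fast on incompatible inputs. A minimal sketch, hypothetical and not part of spaCy's API:

```python
from spacy.gold import align  # assumes spaCy v2.1

def checked_align(tokens_a, tokens_b):
    # Mirror align()'s own normalization: ignore spaces and case.
    text_a = "".join(tokens_a).replace(" ", "").lower()
    text_b = "".join(tokens_b).replace(" ", "").lower()
    if text_a != text_b:
        raise ValueError("Tokenizations cover different strings: %r vs. %r"
                         % (text_a, text_b))
    return align(tokens_a, tokens_b)

cost, a2b, b2a, a2b_multi, b2a_multi = checked_align(["I", "'", "m"], ["I", "'m"])
# checked_align(["I", "'m"], ["I", "am"])  # would raise ValueError
```
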