diff --git a/README.md b/README.md
index bec675b58..f9f484bae 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ valuable if it's shared publicly, so that more people can benefit from it.
 
 - Non-destructive **tokenization**
 - **Named entity** recognition
-- Support for **49+ languages**
+- Support for **50+ languages**
 - Pre-trained [statistical models](https://spacy.io/models) and word vectors
 - State-of-the-art speed
 - Easy **deep learning** integration
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 4fb22f3f0..8ef1fe123 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -70,15 +70,33 @@ def merge_sents(sents):
     return [(m_deps, m_brackets)]
 
 
-def align(cand_words, gold_words):
-    if cand_words == gold_words:
-        alignment = numpy.arange(len(cand_words))
+def align(tokens_a, tokens_b):
+    """Calculate alignment tables between two tokenizations, using the Levenshtein
+    algorithm. The alignment is case-insensitive.
+
+    tokens_a (List[str]): The candidate tokenization.
+    tokens_b (List[str]): The reference tokenization.
+    RETURNS: (tuple): A 5-tuple consisting of the following information:
+      * cost (int): The number of misaligned tokens.
+      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
+        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
+        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
+        it has the value -1.
+      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
+      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
+        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
+        the same token of `tokens_b`.
+      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
+        direction.
+    """
+    if tokens_a == tokens_b:
+        alignment = numpy.arange(len(tokens_a))
         return 0, alignment, alignment, {}, {}
-    cand_words = [w.replace(" ", "").lower() for w in cand_words]
-    gold_words = [w.replace(" ", "").lower() for w in gold_words]
-    cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
-    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
-                                              [len(w) for w in gold_words])
+    tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
+    tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
+    cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
+    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
+                                              [len(w) for w in tokens_b])
     for i, j in list(i2j_multi.items()):
         if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
             i2j[i] = j
diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py
index 29aa5370d..c0ee83e1b 100644
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import pytest
 import spacy
 from spacy.util import minibatch, compounding
 
@@ -9,27 +8,25 @@ from spacy.util import minibatch, compounding
 def test_issue3611():
     """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
     unique_classes = ["offensive", "inoffensive"]
-    x_train = ["This is an offensive text",
-               "This is the second offensive text",
-               "inoff"]
+    x_train = [
+        "This is an offensive text",
+        "This is the second offensive text",
+        "inoff",
+    ]
     y_train = ["offensive", "offensive", "inoffensive"]
 
     # preparing the data
     pos_cats = list()
     for train_instance in y_train:
         pos_cats.append({label: label == train_instance for label in unique_classes})
-    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+    train_data = list(zip(x_train, [{"cats": cats} for cats in pos_cats]))
 
     # set up the spacy model with a text categorizer component
-    nlp = spacy.blank('en')
+    nlp = spacy.blank("en")
 
     textcat = nlp.create_pipe(
         "textcat",
-        config={
-            "exclusive_classes": True,
-            "architecture": "bow",
-            "ngram_size": 2
-        }
+        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
     )
 
     for label in unique_classes:
@@ -37,7 +34,7 @@ def test_issue3611():
     nlp.add_pipe(textcat, last=True)
 
     # training the network
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
     with nlp.disable_pipes(*other_pipes):
         optimizer = nlp.begin_training()
         for i in range(3):
@@ -46,6 +43,10 @@ def test_issue3611():
 
             for batch in batches:
                 texts, annotations = zip(*batch)
-                nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
-
-
+                nlp.update(
+                    docs=texts,
+                    golds=annotations,
+                    sgd=optimizer,
+                    drop=0.1,
+                    losses=losses,
+                )
diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py
index e3e0f25ee..d935db17f 100644
--- a/spacy/tests/regression/test_issue3625.py
+++ b/spacy/tests/regression/test_issue3625.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 
 from spacy.lang.hi import Hindi
 
+
 def test_issue3625():
     """Test that default punctuation rules applies to hindi unicode characters"""
     nlp = Hindi()
-    doc = nlp(u"hi. how हुए. होटल, होटल")
-    assert [token.text for token in doc] == ['hi', '.', 'how', 'हुए', '.', 'होटल', ',', 'होटल']
\ No newline at end of file
+    doc = nlp("hi. how हुए. होटल, होटल")
+    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
+    assert [token.text for token in doc] == expected
diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py
index 34d6bb46e..c24c60b6d 100644
--- a/spacy/tests/regression/test_issue3839.py
+++ b/spacy/tests/regression/test_issue3839.py
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import pytest
 from spacy.matcher import Matcher
 from spacy.tokens import Doc
 
diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py
index 42584b133..62e8eabd6 100644
--- a/spacy/tests/regression/test_issue3869.py
+++ b/spacy/tests/regression/test_issue3869.py
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import pytest
-
 from spacy.attrs import IS_ALPHA
 from spacy.lang.en import English
 
@@ -10,11 +9,11 @@ from spacy.lang.en import English
 @pytest.mark.parametrize(
     "sentence",
     [
-        'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.',
-        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.',
-        'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s number one',
-        'Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.',
-        "It was a missed assignment, but it shouldn't have resulted in a turnover ..."
+ "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", ], ) def test_issue3869(sentence): @@ -27,5 +26,3 @@ def test_issue3869(sentence): count += token.is_alpha assert count == doc.count_by(IS_ALPHA).get(1, 0) - - diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py new file mode 100644 index 000000000..f9912c494 --- /dev/null +++ b/spacy/tests/regression/test_issue3951.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.matcher import Matcher +from spacy.tokens import Doc + + +@pytest.mark.xfail +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", None, pattern) + doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py new file mode 100644 index 000000000..e82dff269 --- /dev/null +++ b/spacy/tests/regression/test_issue3972.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc + + +@pytest.mark.xfail +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs. + """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", None, Doc(en_vocab, words=["New", "York"])) + matcher.add("B", None, Doc(en_vocab, words=["New", "York"])) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + assert len(matches) == 2 diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 13f68a85d..2419a8e55 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -76,6 +76,50 @@ Convert a list of Doc objects into the | `id` | int | ID to assign to the JSON. Defaults to `0`. | | **RETURNS** | list | The data in spaCy's JSON format. | +### gold.align {#align tag="function"} + +Calculate alignment tables between two tokenizations, using the Levenshtein +algorithm. The alignment is case-insensitive. + +> #### Example +> +> ```python +> from spacy.gold import align +> +> bert_tokens = ["obama", "'", "s", "podcast"] +> spacy_tokens = ["obama", "'s", "podcast"] +> alignment = align(bert_tokens, spacy_tokens) +> cost, a2b, b2a, a2b_multi, b2a_multi = alignment +> ``` + +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------------------------------------- | +| `tokens_a` | list | String values of candidate tokens to align. | +| `tokens_b` | list | String values of reference tokens to align. 
+| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. |
+
+The returned tuple contains the following alignment information:
+
+> #### Example
+>
+> ```python
+> a2b = array([0, -1, -1, 2])
+> b2a = array([0, 2, 3])
+> a2b_multi = {1: 1, 2: 1}
+> b2a_multi = {}
+> ```
+>
+> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If
+> there's no one-to-one alignment for a token, it has the value `-1`.
+
+| Name        | Type                                   | Description                                                                                                                                      |
+| ----------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `cost`      | int                                    | The number of misaligned tokens.                                                                                                                 |
+| `a2b`       | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`.                                                                           |
+| `b2a`       | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`.                                                                           |
+| `a2b_multi` | dict                                   | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. |
+| `b2a_multi` | dict                                   | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. |
+
 ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
 
 Encode labelled spans into per-token tags, using the
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 538a9f205..2ef30576e 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -963,6 +963,71 @@ Once you have a [`Doc`](/api/doc) object, you can write to its attributes to
 set the part-of-speech tags, syntactic dependencies, named entities and other
 attributes. For details, see the respective usage pages.
 
+### Aligning tokenization {#aligning-tokenization}
+
+spaCy's tokenization is non-destructive and uses language-specific rules
+optimized for compatibility with treebank annotations. Other tools and resources
+can sometimes tokenize things differently – for example, `"I'm"` →
+`["I", "'", "m"]` instead of `["I", "'m"]`.
+
+In cases like that, you often want to align the tokenization so that you can
+merge annotations from different sources together, or take vectors predicted by
+a [pre-trained BERT model](https://github.com/huggingface/pytorch-transformers)
+and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align)
+helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the
+number of misaligned tokens, the one-to-one mappings of token indices in both
+directions and the indices where multiple tokens align to one single token.
+
+> #### ✏️ Things to try
+>
+> 1. Change the capitalization in one of the token lists – for example,
+>    `"obama"` to `"Obama"`. You'll see that the alignment is case-insensitive.
+> 2. Change `"podcasts"` in `other_tokens` to `"pod", "casts"`. You should see
+>    that there are now 4 misaligned tokens and that the new many-to-one mapping
+>    is reflected in `a2b_multi`.
+> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that the
+>    `cost` is `0` and all corresponding mappings are also identical.
+
+```python
+### {executable="true"}
+from spacy.gold import align
+
+other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
+spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
+cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
+print("Misaligned tokens:", cost)  # 2
+print("One-to-one mappings a -> b", a2b)  # array([0, 1, 2, 3, -1, -1, 5, 6])
+print("One-to-one mappings b -> a", b2a)  # array([0, 1, 2, 3, 5, 6, 7])
+print("Many-to-one mappings a -> b", a2b_multi)  # {4: 4, 5: 4}
+print("Many-to-one mappings b -> a", b2a_multi)  # {}
+```
+
+Here are some insights from the alignment information generated in the example
+above:
+
+- Two tokens are misaligned.
+- The one-to-one mappings for the first four tokens are identical, which means
+  they map to each other. This makes sense because they're also identical in the
+  input: `"i"`, `"listened"`, `"to"` and `"obama"`.
+- The value of `a2b[6]` is `5`, which means that `other_tokens[6]`
+  (`"podcasts"`) aligns to `spacy_tokens[5]` (also `"podcasts"`).
+- `a2b[4]` is `-1`, which means that there is no one-to-one alignment for the
+  token at `other_tokens[4]`. The token `"'"` doesn't exist on its own in
+  `spacy_tokens`. The same goes for `a2b[5]` and `other_tokens[5]`, i.e. `"s"`.
+- The dictionary `a2b_multi` shows that both tokens 4 and 5 of `other_tokens`
+  (`"'"` and `"s"`) align to token 4 of `spacy_tokens` (`"'s"`).
+- The dictionary `b2a_multi` shows that there are no tokens in `spacy_tokens`
+  that map to multiple tokens in `other_tokens`.
+
+<Infobox title="Important note" variant="warning">
+
+The current implementation of the alignment algorithm assumes that both
+tokenizations add up to the same string. For example, you'll be able to align
+`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not
+`["I", "'m"]` and `["I", "am"]`.
+
+</Infobox>
+
 ## Merging and splitting {#retokenization new="2.1"}
 
 The [`Doc.retokenize`](/api/doc#retokenize) context manager lets you merge and
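Beyond the examples included in the patch above, a couple of quick sketches may help when exercising the new `gold.align` helper. These are illustrative only and not part of the diff. First, the fast path for identical tokenizations: per the new code in `spacy/gold.pyx`, two equal token lists come back with zero cost, identity index mappings and empty multi-token tables.

```python
# Sketch (not part of the patch): exercising the identical-tokenization fast
# path that the new spacy/gold.pyx code adds to align().
from spacy.gold import align

tokens = ["obama", "'s", "podcast"]
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens, tokens)
assert cost == 0                      # no misaligned tokens
assert list(a2b) == [0, 1, 2]         # identity mapping a -> b
assert list(b2a) == [0, 1, 2]         # identity mapping b -> a
assert a2b_multi == {} and b2a_multi == {}
```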
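The infobox added to `linguistic-features.md` warns that both tokenizations must add up to the same string. Because `align` normalizes tokens by stripping spaces and lowercasing, a caller can apply the same normalization up front to check whether two tokenizations are alignable at all. The helper below is a hypothetical convenience, not spaCy API.

```python
# Hypothetical pre-check (not spaCy API): mirror align()'s own normalization
# (drop spaces, lowercase) to see whether two tokenizations spell out the same
# underlying text and can therefore be aligned.
def same_underlying_text(tokens_a, tokens_b):
    def flatten(tokens):
        return "".join(t.replace(" ", "").lower() for t in tokens)
    return flatten(tokens_a) == flatten(tokens_b)


assert same_underlying_text(["I", "'", "m"], ["I", "'m"])      # alignable
assert not same_underlying_text(["I", "'m"], ["I", "am"])      # not alignable
```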
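Finally, the usage docs motivate the helper with merging annotations across tokenizations. Here is a minimal sketch of that workflow, assuming `align` behaves as documented above: carry one label per token from another tool's tokenization over to spaCy's tokens via `b2a` and `b2a_multi`. The function name, the example labels and the fallback strategy are assumptions made up for illustration, not part of the patch.

```python
# Sketch under stated assumptions: project per-token labels from another
# tokenization onto spaCy's tokens using the tables returned by align().
from spacy.gold import align


def project_labels(other_tokens, spacy_tokens, other_labels, missing="O"):
    """Return one label per token in `spacy_tokens`, taken from `other_labels`."""
    cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
    projected = []
    for j in range(len(spacy_tokens)):
        if b2a[j] != -1:            # one-to-one alignment
            projected.append(other_labels[b2a[j]])
        elif j in b2a_multi:        # this spaCy token is part of a larger token
            projected.append(other_labels[b2a_multi[j]])
        else:                       # no alignment recovered for this token
            projected.append(missing)
    return projected


other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
other_labels = ["O", "O", "O", "U-PERSON", "O", "O", "O", "O"]
print(project_labels(other_tokens, spacy_tokens, other_labels))  # one label per spaCy token
```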