mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-10 09:16:31 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
4c863aeb06
|
@ -67,7 +67,7 @@ valuable if it's shared publicly, so that more people can benefit from it.
|
||||||
|
|
||||||
- Non-destructive **tokenization**
|
- Non-destructive **tokenization**
|
||||||
- **Named entity** recognition
|
- **Named entity** recognition
|
||||||
- Support for **49+ languages**
|
- Support for **50+ languages**
|
||||||
- Pre-trained [statistical models](https://spacy.io/models) and word vectors
|
- Pre-trained [statistical models](https://spacy.io/models) and word vectors
|
||||||
- State-of-the-art speed
|
- State-of-the-art speed
|
||||||
- Easy **deep learning** integration
|
- Easy **deep learning** integration
|
||||||
|
|
|
@ -70,15 +70,33 @@ def merge_sents(sents):
|
||||||
return [(m_deps, m_brackets)]
|
return [(m_deps, m_brackets)]
|
||||||
|
|
||||||
|
|
||||||
def align(cand_words, gold_words):
|
def align(tokens_a, tokens_b):
|
||||||
if cand_words == gold_words:
|
"""Calculate alignment tables between two tokenizations, using the Levenshtein
|
||||||
alignment = numpy.arange(len(cand_words))
|
algorithm. The alignment is case-insensitive.
|
||||||
|
|
||||||
|
tokens_a (List[str]): The candidate tokenization.
|
||||||
|
tokens_b (List[str]): The reference tokenization.
|
||||||
|
RETURNS: (tuple): A 5-tuple consisting of the following information:
|
||||||
|
* cost (int): The number of misaligned tokens.
|
||||||
|
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
|
||||||
|
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
|
||||||
|
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
|
||||||
|
it has the value -1.
|
||||||
|
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
|
||||||
|
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
|
||||||
|
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
|
||||||
|
the same token of `tokens_b`.
|
||||||
|
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
|
||||||
|
direction.
|
||||||
|
"""
|
||||||
|
if tokens_a == tokens_b:
|
||||||
|
alignment = numpy.arange(len(tokens_a))
|
||||||
return 0, alignment, alignment, {}, {}
|
return 0, alignment, alignment, {}, {}
|
||||||
cand_words = [w.replace(" ", "").lower() for w in cand_words]
|
tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
|
||||||
gold_words = [w.replace(" ", "").lower() for w in gold_words]
|
tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
|
||||||
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
|
cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
|
||||||
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
|
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
|
||||||
[len(w) for w in gold_words])
|
[len(w) for w in tokens_b])
|
||||||
for i, j in list(i2j_multi.items()):
|
for i, j in list(i2j_multi.items()):
|
||||||
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
||||||
i2j[i] = j
|
i2j[i] = j
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
|
|
||||||
|
@ -9,27 +8,25 @@ from spacy.util import minibatch, compounding
|
||||||
def test_issue3611():
|
def test_issue3611():
|
||||||
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
||||||
unique_classes = ["offensive", "inoffensive"]
|
unique_classes = ["offensive", "inoffensive"]
|
||||||
x_train = ["This is an offensive text",
|
x_train = [
|
||||||
|
"This is an offensive text",
|
||||||
"This is the second offensive text",
|
"This is the second offensive text",
|
||||||
"inoff"]
|
"inoff",
|
||||||
|
]
|
||||||
y_train = ["offensive", "offensive", "inoffensive"]
|
y_train = ["offensive", "offensive", "inoffensive"]
|
||||||
|
|
||||||
# preparing the data
|
# preparing the data
|
||||||
pos_cats = list()
|
pos_cats = list()
|
||||||
for train_instance in y_train:
|
for train_instance in y_train:
|
||||||
pos_cats.append({label: label == train_instance for label in unique_classes})
|
pos_cats.append({label: label == train_instance for label in unique_classes})
|
||||||
train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
|
train_data = list(zip(x_train, [{"cats": cats} for cats in pos_cats]))
|
||||||
|
|
||||||
# set up the spacy model with a text categorizer component
|
# set up the spacy model with a text categorizer component
|
||||||
nlp = spacy.blank('en')
|
nlp = spacy.blank("en")
|
||||||
|
|
||||||
textcat = nlp.create_pipe(
|
textcat = nlp.create_pipe(
|
||||||
"textcat",
|
"textcat",
|
||||||
config={
|
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||||
"exclusive_classes": True,
|
|
||||||
"architecture": "bow",
|
|
||||||
"ngram_size": 2
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
for label in unique_classes:
|
for label in unique_classes:
|
||||||
|
@ -37,7 +34,7 @@ def test_issue3611():
|
||||||
nlp.add_pipe(textcat, last=True)
|
nlp.add_pipe(textcat, last=True)
|
||||||
|
|
||||||
# training the network
|
# training the network
|
||||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
|
||||||
with nlp.disable_pipes(*other_pipes):
|
with nlp.disable_pipes(*other_pipes):
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
|
@ -46,6 +43,10 @@ def test_issue3611():
|
||||||
|
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
texts, annotations = zip(*batch)
|
texts, annotations = zip(*batch)
|
||||||
nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
|
nlp.update(
|
||||||
|
docs=texts,
|
||||||
|
golds=annotations,
|
||||||
|
sgd=optimizer,
|
||||||
|
drop=0.1,
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
|
|
|
@ -3,8 +3,10 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.lang.hi import Hindi
|
from spacy.lang.hi import Hindi
|
||||||
|
|
||||||
|
|
||||||
def test_issue3625():
|
def test_issue3625():
|
||||||
"""Test that default punctuation rules applies to hindi unicode characters"""
|
"""Test that default punctuation rules applies to hindi unicode characters"""
|
||||||
nlp = Hindi()
|
nlp = Hindi()
|
||||||
doc = nlp(u"hi. how हुए. होटल, होटल")
|
doc = nlp("hi. how हुए. होटल, होटल")
|
||||||
assert [token.text for token in doc] == ['hi', '.', 'how', 'हुए', '.', 'होटल', ',', 'होटल']
|
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
||||||
|
assert [token.text for token in doc] == expected
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.attrs import IS_ALPHA
|
from spacy.attrs import IS_ALPHA
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
@ -10,11 +9,11 @@ from spacy.lang.en import English
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"sentence",
|
"sentence",
|
||||||
[
|
[
|
||||||
'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.',
|
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
||||||
'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.',
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
||||||
'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s number one',
|
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
||||||
'Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.',
|
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
||||||
"It was a missed assignment, but it shouldn't have resulted in a turnover ..."
|
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_issue3869(sentence):
|
def test_issue3869(sentence):
|
||||||
|
@ -27,5 +26,3 @@ def test_issue3869(sentence):
|
||||||
count += token.is_alpha
|
count += token.is_alpha
|
||||||
|
|
||||||
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
|
22
spacy/tests/regression/test_issue3951.py
Normal file
22
spacy/tests/regression/test_issue3951.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_issue3951(en_vocab):
|
||||||
|
"""Test that combinations of optional rules are matched correctly."""
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [
|
||||||
|
{"LOWER": "hello"},
|
||||||
|
{"LOWER": "this", "OP": "?"},
|
||||||
|
{"OP": "?"},
|
||||||
|
{"LOWER": "world"},
|
||||||
|
]
|
||||||
|
matcher.add("TEST", None, pattern)
|
||||||
|
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 0
|
18
spacy/tests/regression/test_issue3972.py
Normal file
18
spacy/tests/regression/test_issue3972.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from spacy.matcher import PhraseMatcher
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_issue3972(en_vocab):
|
||||||
|
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||||
|
"""
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("A", None, Doc(en_vocab, words=["New", "York"]))
|
||||||
|
matcher.add("B", None, Doc(en_vocab, words=["New", "York"]))
|
||||||
|
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||||
|
matches = matcher(doc)
|
||||||
|
assert len(matches) == 2
|
|
@ -76,6 +76,50 @@ Convert a list of Doc objects into the
|
||||||
| `id` | int | ID to assign to the JSON. Defaults to `0`. |
|
| `id` | int | ID to assign to the JSON. Defaults to `0`. |
|
||||||
| **RETURNS** | list | The data in spaCy's JSON format. |
|
| **RETURNS** | list | The data in spaCy's JSON format. |
|
||||||
|
|
||||||
|
### gold.align {#align tag="function"}
|
||||||
|
|
||||||
|
Calculate alignment tables between two tokenizations, using the Levenshtein
|
||||||
|
algorithm. The alignment is case-insensitive.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.gold import align
|
||||||
|
>
|
||||||
|
> bert_tokens = ["obama", "'", "s", "podcast"]
|
||||||
|
> spacy_tokens = ["obama", "'s", "podcast"]
|
||||||
|
> alignment = align(bert_tokens, spacy_tokens)
|
||||||
|
> cost, a2b, b2a, a2b_multi, b2a_multi = alignment
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | ----- | -------------------------------------------------------------------------- |
|
||||||
|
| `tokens_a` | list | String values of candidate tokens to align. |
|
||||||
|
| `tokens_b` | list | String values of reference tokens to align. |
|
||||||
|
| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. |
|
||||||
|
|
||||||
|
The returned tuple contains the following alignment information:
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> a2b = array([0, -1, -1, 2])
|
||||||
|
> b2a = array([0, 2, 3])
|
||||||
|
> a2b_multi = {1: 1, 2: 1}
|
||||||
|
> b2a_multi = {}
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If
|
||||||
|
> there's no one-to-one alignment for a token, it has the value `-1`.
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `cost` | int | The number of misaligned tokens. |
|
||||||
|
| `a2b` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`. |
|
||||||
|
| `b2a` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`. |
|
||||||
|
| `a2b_multi` | dict | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. |
|
||||||
|
| `b2a_multi` | dict | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. |
|
||||||
|
|
||||||
### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
|
### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
|
||||||
|
|
||||||
Encode labelled spans into per-token tags, using the
|
Encode labelled spans into per-token tags, using the
|
||||||
|
|
|
@ -963,6 +963,71 @@ Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set
|
||||||
the part-of-speech tags, syntactic dependencies, named entities and other
|
the part-of-speech tags, syntactic dependencies, named entities and other
|
||||||
attributes. For details, see the respective usage pages.
|
attributes. For details, see the respective usage pages.
|
||||||
|
|
||||||
|
### Aligning tokenization {#aligning-tokenization}
|
||||||
|
|
||||||
|
spaCy's tokenization is non-destructive and uses language-specific rules
|
||||||
|
optimized for compatibility with treebank annotations. Other tools and resources
|
||||||
|
can sometimes tokenize things differently – for example, `"I'm"` →
|
||||||
|
`["I", "'", "m"]` instead of `["I", "'m"]`.
|
||||||
|
|
||||||
|
In cases like that, you often want to align the tokenization so that you can
|
||||||
|
merge annotations from different sources together, or take vectors predicted by
|
||||||
|
a [pre-trained BERT model](https://github.com/huggingface/pytorch-transformers)
|
||||||
|
and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align)
|
||||||
|
helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the
|
||||||
|
number of misaligned tokens, the one-to-one mappings of token indices in both
|
||||||
|
directions and the indices where multiple tokens align to one single token.
|
||||||
|
|
||||||
|
> #### ✏️ Things to try
|
||||||
|
>
|
||||||
|
> 1. Change the capitalization in one of the token lists – for example,
|
||||||
|
> `"obama"` to `"Obama"`. You'll see that the alignment is case-insensitive.
|
||||||
|
> 2. Change `"podcasts"` in `other_tokens` to `"pod", "casts"`. You should see
|
||||||
|
> that there are now 4 misaligned tokens and that the new many-to-one mapping
|
||||||
|
> is reflected in `a2b_multi`.
|
||||||
|
> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that the
|
||||||
|
> `cost` is `0` and all corresponding mappings are also identical.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
from spacy.gold import align
|
||||||
|
|
||||||
|
other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
|
||||||
|
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
|
||||||
|
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
|
||||||
|
print("Misaligned tokens:", cost) # 2
|
||||||
|
print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6])
|
||||||
|
print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, 5, 6, 7])
|
||||||
|
print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4}
|
||||||
|
print("Many-to-one mappings b-> a", b2a_multi) # {}
|
||||||
|
```
|
||||||
|
|
||||||
|
Here are some insights from the alignment information generated in the example
|
||||||
|
above:
|
||||||
|
|
||||||
|
- Two tokens are misaligned.
|
||||||
|
- The one-to-one mappings for the first four tokens are identical, which means
|
||||||
|
they map to each other. This makes sense because they're also identical in the
|
||||||
|
input: `"i"`, `"listened"`, `"to"` and `"obama"`.
|
||||||
|
- The index mapped to `a2b[6]` is `5`, which means that `other_tokens[6]`
|
||||||
|
(`"podcasts"`) aligns to `spacy_tokens[5]` (also `"podcasts"`).
|
||||||
|
- `a2b[4]` is `-1`, which means that there is no one-to-one alignment for the
|
||||||
|
token at `other_tokens[4]`. The token `"'"` doesn't exist on its own in
|
||||||
|
`spacy_tokens`. The same goes for `a2b[5]` and `other_tokens[5]`, i.e. `"s"`.
|
||||||
|
- The dictionary `a2b_multi` shows that both tokens 4 and 5 of `other_tokens`
|
||||||
|
(`"'"` and `"s"`) align to token 4 of `spacy_tokens` (`"'s"`).
|
||||||
|
- The dictionary `b2a_multi` shows that there are no tokens in `spacy_tokens`
|
||||||
|
that map to multiple tokens in `other_tokens`.
|
||||||
|
|
||||||
|
<Infobox title="Important note" variant="warning">
|
||||||
|
|
||||||
|
The current implementation of the alignment algorithm assumes that both
|
||||||
|
tokenizations add up to the same string. For example, you'll be able to align
|
||||||
|
`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not
|
||||||
|
`["I", "'m"]` and `["I", "am"]`.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
## Merging and splitting {#retokenization new="2.1"}
|
## Merging and splitting {#retokenization new="2.1"}
|
||||||
|
|
||||||
The [`Doc.retokenize`](/api/doc#retokenize) context manager lets you merge and
|
The [`Doc.retokenize`](/api/doc#retokenize) context manager lets you merge and
|
||||||
|
|
Loading…
Reference in New Issue
Block a user