mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 20:28:20 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
2f223a5dd8
|
@ -206,6 +206,9 @@ def debug_data(
|
||||||
missing_values, "value" if missing_values == 1 else "values"
|
missing_values, "value" if missing_values == 1 else "values"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
for label in new_labels:
|
||||||
|
if len(label) == 0:
|
||||||
|
msg.fail("Empty label found in new labels")
|
||||||
if new_labels:
|
if new_labels:
|
||||||
labels_with_counts = [
|
labels_with_counts = [
|
||||||
(label, count)
|
(label, count)
|
||||||
|
|
|
@ -503,6 +503,7 @@ class Errors(object):
|
||||||
"names: {names}")
|
"names: {names}")
|
||||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||||
|
E177 = ("Ill-formed IOB input detected: {tag}")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -522,6 +522,8 @@ def _consume_ent(tags):
|
||||||
tags.pop(0)
|
tags.pop(0)
|
||||||
label = tag[2:]
|
label = tag[2:]
|
||||||
if length == 1:
|
if length == 1:
|
||||||
|
if len(label) == 0:
|
||||||
|
raise ValueError(Errors.E177.format(tag=tag))
|
||||||
return ["U-" + label]
|
return ["U-" + label]
|
||||||
else:
|
else:
|
||||||
start = "B-" + label
|
start = "B-" + label
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||||
from spacy.gold import spans_from_biluo_tags, GoldParse
|
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
|
||||||
from spacy.gold import GoldCorpus, docs_to_json
|
from spacy.gold import GoldCorpus, docs_to_json
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
@ -87,6 +87,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
|
||||||
gold = GoldParse(doc, entities=biluo_tags) # noqa: F841
|
gold = GoldParse(doc, entities=biluo_tags) # noqa: F841
|
||||||
|
|
||||||
|
|
||||||
|
def test_iob_to_biluo():
|
||||||
|
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
||||||
|
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
|
||||||
|
bad_iob = ["O", "O", "\"", "B-LOC", "I-LOC"]
|
||||||
|
converted_biluo = iob_to_biluo(good_iob)
|
||||||
|
assert good_biluo == converted_biluo
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
iob_to_biluo(bad_iob)
|
||||||
|
|
||||||
|
|
||||||
def test_roundtrip_docs_to_json():
|
def test_roundtrip_docs_to_json():
|
||||||
text = "I flew to Silicon Valley via London."
|
text = "I flew to Silicon Valley via London."
|
||||||
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
||||||
|
|
|
@ -166,10 +166,9 @@ cosines are calculated in minibatches, to reduce memory usage.
|
||||||
## Vocab.get_vector {#get_vector tag="method" new="2"}
|
## Vocab.get_vector {#get_vector tag="method" new="2"}
|
||||||
|
|
||||||
Retrieve a vector for a word in the vocabulary. Words can be looked up by string
|
Retrieve a vector for a word in the vocabulary. Words can be looked up by string
|
||||||
or hash value. If no vectors data is loaded, a `ValueError` is raised.
|
or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
|
||||||
|
is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
|
||||||
If `minn` is defined, then the resulting vector uses Fasttext's
|
subword features by averaging over ngrams of `orth` (introduced in spaCy `v2.1`).
|
||||||
subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`)
|
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -178,12 +177,12 @@ subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`)
|
||||||
> nlp.vocab.get_vector("apple", minn=1, maxn=5)
|
> nlp.vocab.get_vector("apple", minn=1, maxn=5)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
|
| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
|
||||||
| `orth` | int / unicode | The hash value of a word, or its unicode string. |
|
| `orth` | int / unicode | The hash value of a word, or its unicode string. |
|
||||||
| `minn` | int | Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. |
|
| `minn` <Tag variant="new">2.1</Tag> | int | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
|
||||||
| `maxn` | int | Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. |
|
| `maxn` <Tag variant="new">2.1</Tag> | int | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
|
||||||
| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. |
|
| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. |
|
||||||
|
|
||||||
## Vocab.set_vector {#set_vector tag="method" new="2"}
|
## Vocab.set_vector {#set_vector tag="method" new="2"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user