mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge branch 'master' into spacy.io
This commit is contained in:
		
						commit
						2f223a5dd8
					
				| 
						 | 
					@ -206,6 +206,9 @@ def debug_data(
 | 
				
			||||||
                missing_values, "value" if missing_values == 1 else "values"
 | 
					                missing_values, "value" if missing_values == 1 else "values"
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					        for label in new_labels:
 | 
				
			||||||
 | 
					            if len(label) == 0:
 | 
				
			||||||
 | 
					                msg.fail("Empty label found in new labels")
 | 
				
			||||||
        if new_labels:
 | 
					        if new_labels:
 | 
				
			||||||
            labels_with_counts = [
 | 
					            labels_with_counts = [
 | 
				
			||||||
                (label, count)
 | 
					                (label, count)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -503,6 +503,7 @@ class Errors(object):
 | 
				
			||||||
            "names: {names}")
 | 
					            "names: {names}")
 | 
				
			||||||
    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
 | 
					    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
 | 
				
			||||||
    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
 | 
					    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
 | 
				
			||||||
 | 
					    E177 = ("Ill-formed IOB input detected: {tag}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@add_codes
 | 
					@add_codes
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -522,6 +522,8 @@ def _consume_ent(tags):
 | 
				
			||||||
        tags.pop(0)
 | 
					        tags.pop(0)
 | 
				
			||||||
    label = tag[2:]
 | 
					    label = tag[2:]
 | 
				
			||||||
    if length == 1:
 | 
					    if length == 1:
 | 
				
			||||||
 | 
					        if len(label) == 0:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E177.format(tag=tag))
 | 
				
			||||||
        return ["U-" + label]
 | 
					        return ["U-" + label]
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        start = "B-" + label
 | 
					        start = "B-" + label
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2,7 +2,7 @@
 | 
				
			||||||
from __future__ import unicode_literals
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 | 
					from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 | 
				
			||||||
from spacy.gold import spans_from_biluo_tags, GoldParse
 | 
					from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
 | 
				
			||||||
from spacy.gold import GoldCorpus, docs_to_json
 | 
					from spacy.gold import GoldCorpus, docs_to_json
 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
| 
						 | 
					@ -87,6 +87,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
 | 
				
			||||||
    gold = GoldParse(doc, entities=biluo_tags)  # noqa: F841
 | 
					    gold = GoldParse(doc, entities=biluo_tags)  # noqa: F841
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_iob_to_biluo():
 | 
				
			||||||
 | 
					    good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
 | 
				
			||||||
 | 
					    good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
 | 
				
			||||||
 | 
					    bad_iob = ["O", "O", "\"", "B-LOC", "I-LOC"]
 | 
				
			||||||
 | 
					    converted_biluo = iob_to_biluo(good_iob)
 | 
				
			||||||
 | 
					    assert good_biluo == converted_biluo
 | 
				
			||||||
 | 
					    with pytest.raises(ValueError):
 | 
				
			||||||
 | 
					        iob_to_biluo(bad_iob)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_roundtrip_docs_to_json():
 | 
					def test_roundtrip_docs_to_json():
 | 
				
			||||||
    text = "I flew to Silicon Valley via London."
 | 
					    text = "I flew to Silicon Valley via London."
 | 
				
			||||||
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
 | 
					    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -166,10 +166,9 @@ cosines are calculated in minibatches, to reduce memory usage.
 | 
				
			||||||
## Vocab.get_vector {#get_vector tag="method" new="2"}
 | 
					## Vocab.get_vector {#get_vector tag="method" new="2"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Retrieve a vector for a word in the vocabulary. Words can be looked up by string
 | 
					Retrieve a vector for a word in the vocabulary. Words can be looked up by string
 | 
				
			||||||
or hash value. If no vectors data is loaded, a `ValueError` is raised.
 | 
					or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
 | 
				
			||||||
 | 
					is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
 | 
				
			||||||
If `minn` is defined, then the resulting vector uses Fasttext's 
 | 
					subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
 | 
				
			||||||
subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example
 | 
					> #### Example
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					@ -179,10 +178,10 @@ subword features by average over ngrams of `orth`. (Introduced in spaCy `v2.1`)
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name                                | Type                                     | Description                                                                                    |
 | 
					| Name                                | Type                                     | Description                                                                                    |
 | 
				
			||||||
| ----------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
 | 
					| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
 | 
				
			||||||
| `orth`                              | int / unicode                            | The hash value of a word, or its unicode string.                                               |
 | 
					| `orth`                              | int / unicode                            | The hash value of a word, or its unicode string.                                               |
 | 
				
			||||||
| `minn`      | int                                      | Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. |
 | 
					| `minn` <Tag variant="new">2.1</Tag> | int                                      | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
 | 
				
			||||||
| `maxn`      | int                                      | Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. |
 | 
					| `maxn` <Tag variant="new">2.1</Tag> | int                                      | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
 | 
				
			||||||
| **RETURNS**                         | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance.                  |
 | 
					| **RETURNS**                         | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance.                  |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Vocab.set_vector {#set_vector tag="method" new="2"}
 | 
					## Vocab.set_vector {#set_vector tag="method" new="2"}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user