mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Tidy up and fix issues
This commit is contained in:
		
							parent
							
								
									de11ea753a
								
							
						
					
					
						commit
						1278161f47
					
				| 
						 | 
					@ -235,7 +235,7 @@ def example_from_conllu_sentence(
 | 
				
			||||||
            subtok_word = ""
 | 
					            subtok_word = ""
 | 
				
			||||||
            in_subtok = False
 | 
					            in_subtok = False
 | 
				
			||||||
        id_ = int(id_) - 1
 | 
					        id_ = int(id_) - 1
 | 
				
			||||||
        head = (int(head) - 1) if head != "0" else id_
 | 
					        head = (int(head) - 1) if head not in ("0", "_") else id_
 | 
				
			||||||
        tag = pos if tag == "_" else tag
 | 
					        tag = pos if tag == "_" else tag
 | 
				
			||||||
        morph = morph if morph != "_" else ""
 | 
					        morph = morph if morph != "_" else ""
 | 
				
			||||||
        dep = "ROOT" if dep == "root" else dep
 | 
					        dep = "ROOT" if dep == "root" else dep
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -541,8 +541,8 @@ class Errors(object):
 | 
				
			||||||
    E997 = ("Tokenizer special cases are not allowed to modify the text. "
 | 
					    E997 = ("Tokenizer special cases are not allowed to modify the text. "
 | 
				
			||||||
            "This would map '{chunk}' to '{orth}' given token attributes "
 | 
					            "This would map '{chunk}' to '{orth}' given token attributes "
 | 
				
			||||||
            "'{token_attrs}'.")
 | 
					            "'{token_attrs}'.")
 | 
				
			||||||
    E998 = ("Can only create GoldParse's from Example's without a Doc, "
 | 
					    E998 = ("Can only create GoldParse objects from Example objects without a "
 | 
				
			||||||
            "if get_gold_parses() is called with a Vocab object.")
 | 
					            "Doc if get_gold_parses() is called with a Vocab object.")
 | 
				
			||||||
    E999 = ("Encountered an unexpected format for the dictionary holding "
 | 
					    E999 = ("Encountered an unexpected format for the dictionary holding "
 | 
				
			||||||
            "gold annotations: {gold_dict}")
 | 
					            "gold annotations: {gold_dict}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -991,11 +991,6 @@ cdef class GoldParse:
 | 
				
			||||||
        self.cats = {} if cats is None else dict(cats)
 | 
					        self.cats = {} if cats is None else dict(cats)
 | 
				
			||||||
        self.links = {} if links is None else dict(links)
 | 
					        self.links = {} if links is None else dict(links)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # orig_annot is used as an iterator in `nlp.evalate` even if self.length == 0,
 | 
					 | 
				
			||||||
        # so set a empty list to avoid error.
 | 
					 | 
				
			||||||
        # if self.lenght > 0, this is modified latter.
 | 
					 | 
				
			||||||
        self.orig_annot = []
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # avoid allocating memory if the doc does not contain any tokens
 | 
					        # avoid allocating memory if the doc does not contain any tokens
 | 
				
			||||||
        if self.length > 0:
 | 
					        if self.length > 0:
 | 
				
			||||||
            if not words:
 | 
					            if not words:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
Example sentences to test spaCy and its language models.
 | 
					Example sentences to test spaCy and its language models.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ...attrs import LIKE_NUM
 | 
					from ...attrs import LIKE_NUM
 | 
				
			||||||
 | 
					
 | 
				
			||||||
_num_words = [
 | 
					_num_words = [
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,8 +1,5 @@
 | 
				
			||||||
# coding: utf8
 | 
					from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB
 | 
				
			||||||
from __future__ import unicode_literals
 | 
					from ...symbols import NOUN, PART, INTJ, PRON
 | 
				
			||||||
 | 
					 | 
				
			||||||
from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
 | 
					 | 
				
			||||||
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html
 | 
					# Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html
 | 
				
			||||||
# fmt: off
 | 
					# fmt: off
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -77,7 +77,7 @@ cdef class Parser:
 | 
				
			||||||
        tok2vec = Tok2Vec(width=token_vector_width,
 | 
					        tok2vec = Tok2Vec(width=token_vector_width,
 | 
				
			||||||
                          embed_size=embed_size,
 | 
					                          embed_size=embed_size,
 | 
				
			||||||
                          conv_depth=conv_depth,
 | 
					                          conv_depth=conv_depth,
 | 
				
			||||||
                          window_size=window_size,
 | 
					                          window_size=conv_window,
 | 
				
			||||||
                          cnn_maxout_pieces=t2v_pieces,
 | 
					                          cnn_maxout_pieces=t2v_pieces,
 | 
				
			||||||
                          subword_features=subword_features,
 | 
					                          subword_features=subword_features,
 | 
				
			||||||
                          pretrained_vectors=pretrained_vectors,
 | 
					                          pretrained_vectors=pretrained_vectors,
 | 
				
			||||||
| 
						 | 
					@ -105,7 +105,7 @@ cdef class Parser:
 | 
				
			||||||
            'bilstm_depth': bilstm_depth,
 | 
					            'bilstm_depth': bilstm_depth,
 | 
				
			||||||
            'self_attn_depth': self_attn_depth,
 | 
					            'self_attn_depth': self_attn_depth,
 | 
				
			||||||
            'conv_depth': conv_depth,
 | 
					            'conv_depth': conv_depth,
 | 
				
			||||||
            'window_size': window_size,
 | 
					            'window_size': conv_window,
 | 
				
			||||||
            'embed_size': embed_size,
 | 
					            'embed_size': embed_size,
 | 
				
			||||||
            'cnn_maxout_pieces': t2v_pieces
 | 
					            'cnn_maxout_pieces': t2v_pieces
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
from spacy.pipeline import EntityRuler
 | 
					from spacy.pipeline import EntityRuler
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,11 +6,12 @@ def test_issue4849():
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ruler = EntityRuler(
 | 
					    ruler = EntityRuler(
 | 
				
			||||||
        nlp, patterns=[
 | 
					        nlp,
 | 
				
			||||||
            {"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
 | 
					        patterns=[
 | 
				
			||||||
            {"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
 | 
					            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
 | 
				
			||||||
 | 
					            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
 | 
				
			||||||
        ],
 | 
					        ],
 | 
				
			||||||
        phrase_matcher_attr="LOWER"
 | 
					        phrase_matcher_attr="LOWER",
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    nlp.add_pipe(ruler)
 | 
					    nlp.add_pipe(ruler)
 | 
				
			||||||
| 
						 | 
					@ -27,10 +25,10 @@ def test_issue4849():
 | 
				
			||||||
    count_ents = 0
 | 
					    count_ents = 0
 | 
				
			||||||
    for doc in nlp.pipe([text], n_process=1):
 | 
					    for doc in nlp.pipe([text], n_process=1):
 | 
				
			||||||
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
 | 
					        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
 | 
				
			||||||
    assert(count_ents == 2)
 | 
					    assert count_ents == 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # USING 2 PROCESSES
 | 
					    # USING 2 PROCESSES
 | 
				
			||||||
    count_ents = 0
 | 
					    count_ents = 0
 | 
				
			||||||
    for doc in nlp.pipe([text], n_process=2):
 | 
					    for doc in nlp.pipe([text], n_process=2):
 | 
				
			||||||
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
 | 
					        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
 | 
				
			||||||
    assert (count_ents == 2)
 | 
					    assert count_ents == 2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,16 +1,9 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					from spacy.language import Language
 | 
				
			||||||
import spacy
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture
 | 
					def test_evaluate():
 | 
				
			||||||
def nlp():
 | 
					    nlp = Language()
 | 
				
			||||||
    return spacy.blank("en")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_evaluate(nlp):
 | 
					 | 
				
			||||||
    docs_golds = [("", {})]
 | 
					    docs_golds = [("", {})]
 | 
				
			||||||
 | 
					    with pytest.raises(ValueError):
 | 
				
			||||||
        nlp.evaluate(docs_golds)
 | 
					        nlp.evaluate(docs_golds)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user