mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Move GoldParse under spacy.syntax
This commit is contained in:
		
							parent
							
								
									32c8fb1372
								
							
						
					
					
						commit
						7b873ce2b1
					
				| 
						 | 
					@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from ..typedefs cimport weight_t, attr_t
 | 
					from ..typedefs cimport weight_t, attr_t
 | 
				
			||||||
from .transition_system cimport TransitionSystem, Transition
 | 
					from .transition_system cimport TransitionSystem, Transition
 | 
				
			||||||
from ..gold cimport GoldParseC
 | 
					from .gold_parse cimport GoldParseC
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class ArcEager(TransitionSystem):
 | 
					cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8,7 +8,7 @@ import json
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..typedefs cimport hash_t, attr_t
 | 
					from ..typedefs cimport hash_t, attr_t
 | 
				
			||||||
from ..strings cimport hash_string
 | 
					from ..strings cimport hash_string
 | 
				
			||||||
from ..gold cimport GoldParse, GoldParseC
 | 
					from .gold_parse cimport GoldParse, GoldParseC
 | 
				
			||||||
from ..structs cimport TokenC
 | 
					from ..structs cimport TokenC
 | 
				
			||||||
from ..tokens.doc cimport Doc, set_children_from_heads
 | 
					from ..tokens.doc cimport Doc, set_children_from_heads
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										39
									
								
								spacy/syntax/gold_parse.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								spacy/syntax/gold_parse.pxd
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,39 @@
 | 
				
			||||||
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
 | 
					from .transition_system cimport Transition
 | 
				
			||||||
 | 
					from ..typedefs cimport attr_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# C-level view of the gold-standard annotations held by a GoldParse.
					# GoldParse.__init__ (gold_parse.pyx) allocates tags, heads, labels, has_dep,
					# sent_start and ner from its memory pool with len(doc) entries each; the
					# comment there says they are filled by the tagger/parser/entity recogniser.
					# `brackets` is declared here but not allocated by that constructor.
					cdef struct GoldParseC:
					    # Per-token integer arrays.
					    int* tags
					    int* heads
					    int* has_dep
					    int* sent_start
					    # Per-token label IDs.
					    attr_t* labels
					    # NOTE(review): presumably constituency bracket data -- not populated in
					    # the code visible alongside this declaration; confirm against users.
					    int** brackets
					    # Per-token transitions for the entity recogniser.
					    Transition* ner
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Attribute layout of the GoldParse extension type implemented in
					# gold_parse.pyx. The declaration order below is unchanged.
					cdef class GoldParse:
					    # Memory pool that owns the arrays referenced from `c`.
					    cdef Pool mem

					    # C-level annotation arrays (see GoldParseC).
					    cdef GoldParseC c
					    # Annotation object built by __init__ from the provided gold values.
					    cdef readonly object orig

					    # Set to len(doc) by __init__ and returned by __len__.
					    cdef int length

					    # Attributes readable and writable from Python.
					    cdef public:
					        int loss
					        list words
					        list tags
					        list pos
					        list morphs
					        list lemmas
					        list sent_starts
					        list heads
					        list labels
					        # NOTE(review): `orths` and `brackets` are not assigned by the
					        # __init__ in gold_parse.pyx -- confirm whether they are still used.
					        dict orths
					        list ner
					        dict brackets
					        dict cats
					        dict links

					    # Token alignment tables: doc token index -> gold word index and the
					    # reverse, with None where there is no one-to-one match.
					    cdef readonly:
					        list cand_to_gold
					        list gold_to_cand
 | 
				
			||||||
							
								
								
									
										311
									
								
								spacy/syntax/gold_parse.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										311
									
								
								spacy/syntax/gold_parse.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,311 @@
 | 
				
			||||||
 | 
					# cython: profile=True
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					import numpy
 | 
				
			||||||
 | 
					import tempfile
 | 
				
			||||||
 | 
					import shutil
 | 
				
			||||||
 | 
					import itertools
 | 
				
			||||||
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					import srsly
 | 
				
			||||||
 | 
					import warnings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .. import util
 | 
				
			||||||
 | 
					from ..syntax import nonproj
 | 
				
			||||||
 | 
					from ..tokens import Doc, Span
 | 
				
			||||||
 | 
					from ..errors import Errors, AlignmentError, Warnings
 | 
				
			||||||
 | 
					from .iob_utils import offsets_from_biluo_tags
 | 
				
			||||||
 | 
					from .align import align
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Matches a single non-word character. Not referenced elsewhere in this module.
					punct_re = re.compile(r"\W")


					def is_punct_label(label):
					    """Check whether a dependency label marks punctuation.

					    label: The dependency label to test. Non-string values (for example
					        the None used for missing annotations) are never punctuation.
					    RETURNS (bool): True if the label is exactly "P" or equals "punct"
					        ignoring case, otherwise False.
					    """
					    # Missing labels are represented as None; calling .lower() on them
					    # would raise AttributeError, so answer False instead.
					    if not isinstance(label, str):
					        return False
					    return label == "P" or label.lower() == "punct"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class GoldParse:
 | 
				
			||||||
 | 
					    """Collection for training annotations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DOCS: https://spacy.io/api/goldparse
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    @classmethod
					    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
					        """Alternate constructor that unpacks annotation objects.

					        doc: The document the annotations refer to.
					        doc_annotation: Object providing the `cats` and `links` attributes.
					        token_annotation: Object providing the `words`, `tags`, `pos`,
					            `morphs`, `lemmas`, `heads`, `deps`, `entities` and
					            `sent_starts` attributes.
					        make_projective (bool): Forwarded unchanged to the constructor.
					        RETURNS: A new instance of `cls`.
					        """
					        per_token = dict(
					            words=token_annotation.words,
					            tags=token_annotation.tags,
					            pos=token_annotation.pos,
					            morphs=token_annotation.morphs,
					            lemmas=token_annotation.lemmas,
					            heads=token_annotation.heads,
					            deps=token_annotation.deps,
					            entities=token_annotation.entities,
					            sent_starts=token_annotation.sent_starts,
					        )
					        per_doc = dict(cats=doc_annotation.cats, links=doc_annotation.links)
					        return cls(doc, make_projective=make_projective, **per_token, **per_doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_token_annotation(self):
					        """Bundle the aligned per-token annotations into a TokenAnnotation.

					        The `ids` are 0..len(self.words)-1, or None when `self.words` is
					        empty or unset.

					        RETURNS (TokenAnnotation): The annotations stored on this object.
					        """
					        # NOTE(review): TokenAnnotation is not imported anywhere in this
					        # module -- confirm where it should come from.
					        ids = list(range(len(self.words))) if self.words else None
					        fields = dict(
					            ids=ids,
					            words=self.words,
					            tags=self.tags,
					            pos=self.pos,
					            morphs=self.morphs,
					            lemmas=self.lemmas,
					            heads=self.heads,
					            deps=self.labels,
					            entities=self.ner,
					            sent_starts=self.sent_starts,
					        )
					        return TokenAnnotation(**fields)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
					                 lemmas=None, heads=None, deps=None, entities=None,
					                 sent_starts=None, make_projective=False, cats=None,
					                 links=None):
					        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

					        doc (Doc): The document the annotations refer to.
					        words (iterable): A sequence of unicode word strings.
					        tags (iterable): A sequence of strings, representing tag annotations.
					        pos (iterable): A sequence of strings, representing UPOS annotations.
					        morphs (iterable): A sequence of strings, representing morph
					            annotations.
					        lemmas (iterable): A sequence of strings, representing lemma
					            annotations.
					        heads (iterable): A sequence of integers, representing syntactic
					            head offsets.
					        deps (iterable): A sequence of strings, representing the syntactic
					            relation types.
					        entities (iterable): A sequence of named entity annotations, either as
					            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
					            representing the entity positions.
					        sent_starts (iterable): A sequence of sentence position tags, 1 for
					            the first word in a sentence, 0 for all others.
					        make_projective (bool): If True and both heads and deps contain
					            truthy values, run `nonproj.projectivize` on them before
					            aligning.
					        cats (dict): Labels for text classification. Each key in the dictionary
					            may be a string or an int, or a `(start_char, end_char, label)`
					            tuple, indicating that the label is applied to only part of the
					            document (usually a sentence). Unlike entity annotations, label
					            annotations can overlap, i.e. a single word can be covered by
					            multiple labelled spans. The TextCategorizer component expects
					            true examples of a label to have the value 1.0, and negative
					            examples of a label to have the value 0.0. Labels not in the
					            dictionary are treated as missing - the gradient for those labels
					            will be zero.
					        links (dict): A dict with `(start_char, end_char)` keys,
					            and the values being dicts with kb_id:value entries,
					            representing the external IDs in a knowledge base (KB)
					            mapped to either 1.0 or 0.0, indicating positive and
					            negative examples respectively.
					        RAISES (ValueError): If the aligned heads contain a cycle.
					        RETURNS (GoldParse): The newly constructed object.
					        """
					        # `biluo_tags_from_offsets` is used below but is not part of the
					        # module-level imports; bring it in from the same module that
					        # provides `offsets_from_biluo_tags`.
					        # NOTE(review): TokenAnnotation is likewise not imported anywhere in
					        # this module -- confirm where it should come from.
					        from .iob_utils import biluo_tags_from_offsets

					        self.mem = Pool()
					        self.loss = 0
					        self.length = len(doc)

					        # Copy so that later mutation of the caller's dicts has no effect here.
					        self.cats = {} if cats is None else dict(cats)
					        self.links = {} if links is None else dict(links)

					        # temporary doc for aligning entity annotation
					        entdoc = None

					        # avoid allocating memory if the doc does not contain any tokens
					        if self.length == 0:
					            # NOTE(review): pos, lemmas, sent_starts and the alignment tables
					            # are left unset in this branch -- confirm that is intended.
					            self.words = []
					            self.tags = []
					            self.heads = []
					            self.labels = []
					            self.ner = []
					            self.morphs = []
					            # set a minimal orig so that the scorer can score an empty doc
					            self.orig = TokenAnnotation(ids=[])
					        else:
					            # Fill in defaults for every annotation layer that was not given.
					            if not words:
					                words = [token.text for token in doc]
					            if not tags:
					                tags = [None for _ in words]
					            if not pos:
					                pos = [None for _ in words]
					            if not morphs:
					                morphs = [None for _ in words]
					            if not lemmas:
					                lemmas = [None for _ in words]
					            if not heads:
					                heads = [None for _ in words]
					            if not deps:
					                deps = [None for _ in words]
					            if not sent_starts:
					                sent_starts = [None for _ in words]
					            if entities is None:
					                # No entity annotation at all: every token is "missing".
					                entities = ["-" for _ in words]
					            elif len(entities) == 0:
					                # Explicitly empty annotation: every token is outside an entity.
					                entities = ["O" for _ in words]
					            else:
					                # Translate the None values to '-', to make processing easier.
					                # See Issue #2603
					                entities = [(ent if ent is not None else "-") for ent in entities]
					                if not isinstance(entities[0], str):
					                    # Assume we have entities specified by character offset.
					                    # Create a temporary Doc corresponding to provided words
					                    # (to preserve gold tokenization) and text (to preserve
					                    # character offsets).
					                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
					                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
					                    entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
					                    # There may be some additional whitespace tokens in the
					                    # temporary doc, so check that the annotations align with
					                    # the provided words while building a list of BILUO labels.
					                    entities = []
					                    words_offset = 0
					                    for i in range(len(entdoc_words)):
					                        word_i = i + words_offset
					                        # Extra tokens after the last gold word must not index
					                        # past the end of `words`; treat them as unmatched.
					                        if word_i < len(words) and words[word_i] == entdoc_words[i]:
					                            entities.append(entdoc_entities[i])
					                        else:
					                            words_offset -= 1
					                    if len(entities) != len(words):
					                        warnings.warn(Warnings.W029.format(text=doc.text))
					                        entities = ["-" for _ in words]

					            # These are filled by the tagger/parser/entity recogniser
					            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
					            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
					            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
					            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
					            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
					            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

					            # Python-level annotations, one slot per token of `doc`.
					            self.words = [None] * len(doc)
					            self.tags = [None] * len(doc)
					            self.pos = [None] * len(doc)
					            self.morphs = [None] * len(doc)
					            self.lemmas = [None] * len(doc)
					            self.heads = [None] * len(doc)
					            self.labels = [None] * len(doc)
					            self.ner = [None] * len(doc)
					            self.sent_starts = [None] * len(doc)

					            # This needs to be done before we align the words
					            if make_projective and any(heads) and any(deps):
					                heads, deps = nonproj.projectivize(heads, deps)

					            # Do many-to-one alignment for misaligned tokens.
					            # If we over-segment, we'll have one gold word that covers a sequence
					            # of predicted words
					            # If we under-segment, we'll have one predicted word that covers a
					            # sequence of gold words.
					            # If we "mis-segment", we'll have a sequence of predicted words covering
					            # a sequence of gold words. That's many-to-many -- we don't do that
					            # except for NER spans where the start and end can be aligned.
					            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)

					            # Negative alignment entries mean "no one-to-one match".
					            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
					            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

					            self.orig = TokenAnnotation(ids=list(range(len(words))),
					                    words=words, tags=tags, pos=pos, morphs=morphs,
					                    lemmas=lemmas, heads=heads, deps=deps, entities=entities,
					                    sent_starts=sent_starts, brackets=[])

					            for i, gold_i in enumerate(self.cand_to_gold):
					                if doc[i].text.isspace():
					                    self.words[i] = doc[i].text
					                    self.tags[i] = "_SP"
					                    self.pos[i] = "SPACE"
					                    self.morphs[i] = None
					                    self.lemmas[i] = None
					                    self.heads[i] = None
					                    self.labels[i] = None
					                    self.ner[i] = None
					                    self.sent_starts[i] = 0
					                # Not an elif: a whitespace token that does have an alignment
					                # is overwritten by the branches below.
					                if gold_i is None:
					                    if i in i2j_multi:
					                        multi_j = i2j_multi[i]
					                        self.words[i] = words[multi_j]
					                        self.tags[i] = tags[multi_j]
					                        self.pos[i] = pos[multi_j]
					                        self.morphs[i] = morphs[multi_j]
					                        self.lemmas[i] = lemmas[multi_j]
					                        self.sent_starts[i] = sent_starts[multi_j]
					                        is_last = multi_j != i2j_multi.get(i+1)
					                        # Set next word in multi-token span as head, until last
					                        if not is_last:
					                            self.heads[i] = i+1
					                            self.labels[i] = "subtok"
					                        else:
					                            head_i = heads[multi_j]
					                            # Compare against None explicitly: index 0 is a
					                            # valid head and must not be treated as missing.
					                            if head_i is not None:
					                                self.heads[i] = self.gold_to_cand[head_i]
					                            self.labels[i] = deps[multi_j]
					                        ner_tag = entities[multi_j]
					                        # Assign O/- for many-to-one O/- NER tags
					                        if ner_tag in ("O", "-"):
					                            self.ner[i] = ner_tag
					                else:
					                    self.words[i] = words[gold_i]
					                    self.tags[i] = tags[gold_i]
					                    self.pos[i] = pos[gold_i]
					                    self.morphs[i] = morphs[gold_i]
					                    self.lemmas[i] = lemmas[gold_i]
					                    self.sent_starts[i] = sent_starts[gold_i]
					                    if heads[gold_i] is None:
					                        self.heads[i] = None
					                    else:
					                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
					                    self.labels[i] = deps[gold_i]
					                    self.ner[i] = entities[gold_i]
					            # Assign O/- for one-to-many O/- NER tags
					            for j, cand_j in enumerate(self.gold_to_cand):
					                if cand_j is None:
					                    if j in j2i_multi:
					                        i = j2i_multi[j]
					                        ner_tag = entities[j]
					                        if ner_tag in ("O", "-"):
					                            self.ner[i] = ner_tag

					            # If there is entity annotation and some tokens remain unaligned,
					            # align all entities at the character level to account for all
					            # possible token misalignments within the entity spans
					            if any(e not in ("O", "-") for e in entities) and None in self.ner:
					                # If the temporary entdoc wasn't created above, initialize it
					                if entdoc is None:
					                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
					                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
					                # Get offsets based on gold words and BILUO entities
					                entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
					                aligned_offsets = []
					                aligned_spans = []
					                # Filter offsets to identify those that align with doc tokens
					                for offset in entdoc_offsets:
					                    span = doc.char_span(offset[0], offset[1])
					                    if span and not span.text.isspace():
					                        aligned_offsets.append(offset)
					                        aligned_spans.append(span)
					                # Convert back to BILUO for doc tokens and assign NER for all
					                # aligned spans
					                biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
					                for span in aligned_spans:
					                    for i in range(span.start, span.end):
					                        self.ner[i] = biluo_tags[i]

					            # Prevent whitespace that isn't within entities from being tagged as
					            # an entity.
					            for i in range(len(self.ner)):
					                if self.tags[i] == "_SP":
					                    prev_ner = self.ner[i-1] if i >= 1 else None
					                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
					                    if prev_ner == "O" or next_ner == "O":
					                        self.ner[i] = "O"

					            # Reject annotations whose aligned heads contain a cycle.
					            cycle = nonproj.contains_cycle(self.heads)
					            if cycle is not None:
					                raise ValueError(Errors.E069.format(cycle=cycle,
					                    cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
					                    doc_tokens=" ".join(words[:50])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __len__(self):
					        """Return the token count recorded when the object was created.

					        The value is the `len(doc)` stored by `__init__`, i.e. the number
					        of tokens in the document the annotations were aligned to.

					        RETURNS (int): The stored length.
					        """
					        n_tokens = self.length
					        return n_tokens
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @property
					    def is_projective(self):
					        """True when `nonproj.is_nonproj_tree` reports that the aligned
					        heads (`self.heads`) are not a non-projective tree.
					        """
					        has_crossing_arcs = nonproj.is_nonproj_tree(self.heads)
					        return not has_crossing_arcs
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,6 @@
 | 
				
			||||||
from .transition_system cimport TransitionSystem
 | 
					from .transition_system cimport TransitionSystem
 | 
				
			||||||
from .transition_system cimport Transition
 | 
					from .transition_system cimport Transition
 | 
				
			||||||
from ..gold cimport GoldParseC
 | 
					from .gold_parse cimport GoldParseC
 | 
				
			||||||
from ..typedefs cimport attr_t
 | 
					from ..typedefs cimport attr_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,7 +7,7 @@ from .stateclass cimport StateClass
 | 
				
			||||||
from ._state cimport StateC
 | 
					from ._state cimport StateC
 | 
				
			||||||
from .transition_system cimport Transition
 | 
					from .transition_system cimport Transition
 | 
				
			||||||
from .transition_system cimport do_func_t
 | 
					from .transition_system cimport do_func_t
 | 
				
			||||||
from ..gold cimport GoldParseC, GoldParse
 | 
					from .gold_parse cimport GoldParseC, GoldParse
 | 
				
			||||||
from ..lexeme cimport Lexeme
 | 
					from ..lexeme cimport Lexeme
 | 
				
			||||||
from ..attrs cimport IS_SPACE
 | 
					from ..attrs cimport IS_SPACE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -20,7 +20,7 @@ import numpy
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..tokens.doc cimport Doc
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
from ..gold cimport GoldParse
 | 
					from .gold_parse cimport GoldParse
 | 
				
			||||||
from ..typedefs cimport weight_t, class_t, hash_t
 | 
					from ..typedefs cimport weight_t, class_t, hash_t
 | 
				
			||||||
from ._parser_model cimport alloc_activations, free_activations
 | 
					from ._parser_model cimport alloc_activations, free_activations
 | 
				
			||||||
from ._parser_model cimport predict_states, arg_max_if_valid
 | 
					from ._parser_model cimport predict_states, arg_max_if_valid
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2,8 +2,8 @@ from cymem.cymem cimport Pool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..typedefs cimport attr_t, weight_t
 | 
					from ..typedefs cimport attr_t, weight_t
 | 
				
			||||||
from ..structs cimport TokenC
 | 
					from ..structs cimport TokenC
 | 
				
			||||||
from ..gold cimport GoldParse
 | 
					from .gold_parse cimport GoldParse
 | 
				
			||||||
from ..gold cimport GoldParseC
 | 
					from .gold_parse cimport GoldParseC
 | 
				
			||||||
from ..strings cimport StringStore
 | 
					from ..strings cimport StringStore
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from ._state cimport StateC
 | 
					from ._state cimport StateC
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user