Mirror of https://github.com/explosion/spaCy.git

Tidy up gold

commit a6135336f5 (parent 6a0483b7aa)

Changed file: spacy/gold.pyx
@@ -54,7 +54,8 @@ def merge_sents(sents):
         m_deps[3].extend(head + i for head in heads)
         m_deps[4].extend(labels)
         m_deps[5].extend(ner)
-        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
+        m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
+                          for b in brackets)
         i += len(ids)
     return [(m_deps, m_brackets)]
 
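Note: the hunk above is only a line re-wrap, but it shows the core of merge_sents: per-sentence annotation tuples are concatenated while every head index and bracket offset is shifted by the running token count i. A rough standalone sketch of that offsetting with invented two-sentence data (only fields 3-5 and the i counter appear in the hunk; the ids are assumed to shift the same way):

    # Sketch: merging two sentences' (ids, words, tags, heads, labels, ner) tuples.
    # Head indices are sentence-local, so the second sentence's heads must be
    # shifted by the number of tokens already merged (here, 2).
    sent1 = ([0, 1], ['I', 'slept'], ['PRP', 'VBD'], [1, 1], ['nsubj', 'ROOT'], ['O', 'O'])
    sent2 = ([0, 1], ['Good', 'night'], ['JJ', 'NN'], [1, 1], ['amod', 'ROOT'], ['O', 'O'])

    merged = [[], [], [], [], [], []]
    i = 0
    for ids, words, tags, heads, labels, ner in [sent1, sent2]:
        merged[0].extend(id_ + i for id_ in ids)       # assumed: ids shift like heads
        merged[1].extend(words)
        merged[2].extend(tags)
        merged[3].extend(head + i for head in heads)   # as in the hunk above
        merged[4].extend(labels)
        merged[5].extend(ner)
        i += len(ids)

    print(merged[3])   # [1, 1, 3, 3]: the second sentence's heads shifted by 2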
@@ -80,6 +81,8 @@ def align(cand_words, gold_words):
 
 
 punct_re = re.compile(r'\W')
+
+
 def _min_edit_path(cand_words, gold_words):
     cdef:
         Pool mem
@@ -98,9 +101,9 @@ def _min_edit_path(cand_words, gold_words):
     mem = Pool()
     n_cand = len(cand_words)
     n_gold = len(gold_words)
-    # Levenshtein distance, except we need the history, and we may want different
-    # costs.
-    # Mark operations with a string, and score the history using _edit_cost.
+    # Levenshtein distance, except we need the history, and we may want
+    # different costs. Mark operations with a string, and score the history
+    # using _edit_cost.
     previous_row = []
     prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
     curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
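Note: the re-flowed comment describes the approach in _min_edit_path: a standard Levenshtein table, except each cell also keeps the sequence of edit operations so the alignment path can be recovered and re-scored. A rough pure-Python sketch of that idea (the real function is Cython, uses int rows allocated from a Pool, and scores histories with _edit_cost; the operation letters below are illustrative, not spaCy's):

    def min_edit_path_sketch(cand_words, gold_words):
        # Dynamic-programming table of (cost, history) pairs. 'M' = match,
        # 'S' = substitute, 'I' = insert, 'D' = delete (illustrative labels).
        prev = [(j, 'I' * j) for j in range(len(gold_words) + 1)]
        for i, cand in enumerate(cand_words, start=1):
            curr = [(i, 'D' * i)]
            for j, gold in enumerate(gold_words, start=1):
                sub_cost = 0 if cand == gold else 1
                options = [
                    (prev[j - 1][0] + sub_cost,
                     prev[j - 1][1] + ('M' if sub_cost == 0 else 'S')),
                    (prev[j][0] + 1, prev[j][1] + 'D'),
                    (curr[j - 1][0] + 1, curr[j - 1][1] + 'I'),
                ]
                curr.append(min(options))
            prev = curr
        return prev[-1]

    print(min_edit_path_sketch(['a', 'b', 'c'], ['a', 'c']))   # (1, 'MDM')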
@@ -144,9 +147,9 @@ def _min_edit_path(cand_words, gold_words):
 
 
 def minibatch(items, size=8):
-    '''Iterate over batches of items. `size` may be an iterator,
+    """Iterate over batches of items. `size` may be an iterator,
     so that batch-size can vary on each step.
-    '''
+    """
     if isinstance(size, int):
         size_ = itertools.repeat(8)
     else:
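Note: the docstring converted to """ above also records the useful behaviour: size may be an iterator, so the batch size can change on every step. A hedged usage sketch (the import path follows this file; list output and the exact batching behaviour are assumptions):

    import itertools

    from spacy.gold import minibatch   # minibatch lives in gold.pyx in this version

    items = list(range(10))

    # `size` may be any iterator of ints, so the batch size can vary per step,
    # e.g. growing from 1 towards 4:
    sizes = itertools.chain([1, 2], itertools.repeat(4))
    print(list(minibatch(items, size=sizes)))
    # expected: [[0], [1, 2], [3, 4, 5, 6], [7, 8, 9]]

    # An int is also accepted, but note that in the version shown here an int
    # falls through to itertools.repeat(8), i.e. the literal 8 is used rather
    # than the value that was passed in.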
@@ -168,6 +171,7 @@ class GoldCorpus(object):
 
         train_path (unicode or Path): File or directory of training data.
         dev_path (unicode or Path): File or directory of development data.
+        RETURNS (GoldCorpus): The newly created object.
         """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
@@ -213,7 +217,7 @@ class GoldCorpus(object):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
-                               self.train_tuples, label_freq_cutoff=100)
+                self.train_tuples, label_freq_cutoff=100)
         random.shuffle(train_tuples)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                         max_length=max_length,
@@ -222,7 +226,6 @@ class GoldCorpus(object):
 
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs
 
     @classmethod
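Note: the GoldCorpus hunks above show the training-time surface: the corpus is constructed from a train path and a dev path, train_docs can projectivize the parses (with label_freq_cutoff=100 in this version), and dev_docs yields gold documents for evaluation. A hedged usage sketch (the placeholder paths, spacy.blank, the train_docs keyword arguments and the (doc, gold) pair shape are assumptions beyond what the diff shows):

    import spacy
    from spacy.gold import GoldCorpus   # defined in gold.pyx

    nlp = spacy.blank('en')             # any Language object with a tokenizer

    # Each path may be a single JSON file or a directory of training data.
    corpus = GoldCorpus('train.json', 'dev.json')

    # train_docs shuffles the training tuples and, when projectivized, runs
    # nonproj.preprocess_training_data with label_freq_cutoff=100 (see above).
    for doc, gold in corpus.train_docs(nlp, gold_preproc=False):
        pass   # e.g. feed into nlp.update(...)

    # dev_docs simply yields gold documents for evaluation.
    for doc, gold in corpus.dev_docs(nlp):
        pass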
@@ -233,7 +236,6 @@ class GoldCorpus(object):
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
@@ -248,17 +250,20 @@ class GoldCorpus(object):
             raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                for (sent_tuples, brackets) in paragraph_tuples]
+            return [Doc(nlp.vocab,
+                        words=add_noise(sent_tuples[1], noise_level))
+                    for (sent_tuples, brackets) in paragraph_tuples]
 
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
         assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
+            return [GoldParse.from_annot_tuples(docs[0],
+                                                paragraph_tuples[0][0])]
         else:
             return [GoldParse.from_annot_tuples(doc, sent_tuples)
-                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
+                    for doc, (sent_tuples, brackets)
+                    in zip(docs, paragraph_tuples)]
 
     @staticmethod
     def walk_corpus(path):
@@ -330,16 +335,16 @@ def read_json_file(loc, docs_filter=None, limit=None):
                     for i, token in enumerate(sent['tokens']):
                         words.append(token['orth'])
                         ids.append(i)
-                        tags.append(token.get('tag','-'))
-                        heads.append(token.get('head',0) + i)
-                        labels.append(token.get('dep',''))
+                        tags.append(token.get('tag', '-'))
+                        heads.append(token.get('head', 0) + i)
+                        labels.append(token.get('dep', ''))
                         # Ensure ROOT label is case-insensitive
                         if labels[-1].lower() == 'root':
                             labels[-1] = 'ROOT'
                         ner.append(token.get('ner', '-'))
                     sents.append([
                         [ids, words, tags, heads, labels, ner],
-                         sent.get('brackets', [])])
+                        sent.get('brackets', [])])
                 if sents:
                     yield [paragraph.get('raw', None), sents]
 
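Note: the read_json_file hunk above doubles as documentation of the JSON training format: a paragraph may carry a 'raw' text and sentences whose 'tokens' provide 'orth', 'tag', 'head' (stored relative to the token's own index), 'dep' and 'ner', plus optional 'brackets'. A minimal illustrative paragraph in that shape (the values and the 'sentences' key name are assumptions; only sent['tokens'] and sent.get('brackets') appear in the hunk):

    # One paragraph as consumed by the loop above. 'head' is relative: it is
    # added to the token index i, so 1 means "the next token", -1 "the previous".
    paragraph = {
        'raw': 'I like London.',
        'sentences': [
            {
                'tokens': [
                    {'orth': 'I',      'tag': 'PRP', 'head': 1,  'dep': 'nsubj', 'ner': 'O'},
                    {'orth': 'like',   'tag': 'VBP', 'head': 0,  'dep': 'ROOT',  'ner': 'O'},
                    {'orth': 'London', 'tag': 'NNP', 'head': -1, 'dep': 'dobj',  'ner': 'U-GPE'},
                    {'orth': '.',      'tag': '.',   'head': -2, 'dep': 'punct', 'ner': 'O'},
                ],
                'brackets': [],
            },
        ],
    }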
@@ -382,19 +387,21 @@ cdef class GoldParse:
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples
-        return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
-                   make_projective=make_projective)
+        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
+                   entities=entities, make_projective=make_projective)
 
-    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False,
+    def __init__(self, doc, annot_tuples=None, words=None, tags=None,
+                 heads=None, deps=None, entities=None, make_projective=False,
                  cats=None):
         """Create a GoldParse.
 
         doc (Doc): The document the annotations refer to.
         words (iterable): A sequence of unicode word strings.
         tags (iterable): A sequence of strings, representing tag annotations.
-        heads (iterable): A sequence of integers, representing syntactic head offsets.
-        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
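Note: the re-wrapped signature and docstring above spell out the keyword API of GoldParse: words, tags, heads, deps, entities (BILUO tag strings or (start_char, end_char, label) tuples), make_projective and cats. A hedged construction sketch using only those documented arguments (sentence and annotations invented; spacy.blank assumed available):

    import spacy
    from spacy.gold import GoldParse   # GoldParse is the cdef class in this file

    nlp = spacy.blank('en')
    doc = nlp.make_doc('I like London')

    gold = GoldParse(
        doc,
        words=['I', 'like', 'London'],
        tags=['PRP', 'VBP', 'NNP'],
        heads=[1, 1, 1],               # syntactic head for each token ('like' heads all)
        deps=['nsubj', 'ROOT', 'dobj'],
        # entities may be BILUO tags or (start_char, end_char, label) tuples:
        entities=[(7, 13, 'GPE')],
    )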
@@ -404,9 +411,10 @@ cdef class GoldParse:
             document (usually a sentence). Unlike entity annotations, label
             annotations can overlap, i.e. a single word can be covered by
             multiple labelled spans. The TextCategorizer component expects
-            true examples of a label to have the value 1.0, and negative examples
-            of a label to have the value 0.0. Labels not in the dictionary are
-            treated as missing -- the gradient for those labels will be zero.
+            true examples of a label to have the value 1.0, and negative
+            examples of a label to have the value 0.0. Labels not in the
+            dictionary are treated as missing - the gradient for those labels
+            will be zero.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
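Note: the cats description above implies a flat dictionary for text-classification gold data: 1.0 for a true example of a label, 0.0 for an explicit negative, and anything absent is treated as missing. A small illustrative dict (label names invented):

    # Gold categories for one document, per the docstring above: 1.0 = true
    # example, 0.0 = negative example, absent = missing (zero gradient).
    cats = {
        'SPORTS': 1.0,     # positive example of SPORTS
        'POLITICS': 0.0,   # explicit negative example of POLITICS
        # 'WEATHER' omitted: missing, contributes no gradient
    }
    # gold = GoldParse(doc, cats=cats)   # passed via the `cats` keyword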
@@ -470,11 +478,11 @@ cdef class GoldParse:
                 self.ner[i] = entities[gold_i]
 
         cycle = nonproj.contains_cycle(self.heads)
-        if cycle != None:
+        if cycle is not None:
             raise Exception("Cycle found: %s" % cycle)
 
         if make_projective:
-            proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
+            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
             self.heads = proj_heads
 
     def __len__(self):
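Note: the `is not None` fix above sits in the head-cycle guard: nonproj.contains_cycle evidently returns None for a well-formed tree and something non-None describing the cycle otherwise, and projectivize returns a (heads, labels) pair. A small hedged illustration (import path and exact return types are assumptions):

    from spacy.syntax import nonproj   # module referenced in the hunk above

    # A well-formed tree: every token reaches the root at index 1.
    heads = [1, 1, 1]
    labels = ['nsubj', 'ROOT', 'dobj']
    assert nonproj.contains_cycle(heads) is None

    # Tokens 0 and 2 point at each other and never reach a root, so GoldParse
    # raises "Cycle found: ..." for annotations like this.
    assert nonproj.contains_cycle([2, 1, 0]) is not None

    # projectivize returns a (heads, labels) pair, matching the unpacking above.
    proj_heads, proj_labels = nonproj.projectivize(heads, labels)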
@@ -497,20 +505,19 @@ cdef class GoldParse:
 
 
 def biluo_tags_from_offsets(doc, entities, missing='O'):
-    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (BILUO).
+    """Encode labelled spans into per-token tags, using the
+    Begin/In/Last/Unit/Out scheme (BILUO).
 
     doc (Doc): The document that the entity offsets refer to. The output tags
         will refer to the token boundaries within the document.
-    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
-        `end` should be character-offset integers denoting the slice into the
-        original string.
-
+    entities (iterable): A sequence of `(start, end, label)` triples. `start`
+        and `end` should be character-offset integers denoting the slice into
+        the original string.
     RETURNS (list): A list of unicode strings, describing the tags. Each tag
         string will be of the form either "", "O" or "{action}-{label}", where
         action is one of "B", "I", "L", "U". The string "-" is used where the
-        entity offsets don't align with the tokenization in the `Doc` object. The
-        training algorithm will view these as missing values. "O" denotes a
+        entity offsets don't align with the tokenization in the `Doc` object.
+        The training algorithm will view these as missing values. "O" denotes a
         non-entity token. "B" denotes the beginning of a multi-token entity,
         "I" the inside of an entity of three or more tokens, and "L" the end
         of an entity of two or more tokens. "U" denotes a single-token entity.
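Note: the docstring above fully specifies the output, so a short usage sketch is easy to check by hand (spacy.blank and the import path are assumptions of a v2-style setup):

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')
    doc = nlp('I flew to San Francisco Bay')

    # Character offsets 10-27 cover "San Francisco Bay", a three-token entity.
    tags = biluo_tags_from_offsets(doc, [(10, 27, 'LOC')])
    print(tags)   # expected: ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']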