mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge branch 'develop' into feature/project-cli
This commit is contained in:
		
						commit
						7a0fe50610
					
				| 
						 | 
					@ -14,7 +14,7 @@ import spacy
 | 
				
			||||||
import spacy.util
 | 
					import spacy.util
 | 
				
			||||||
from bin.ud import conll17_ud_eval
 | 
					from bin.ud import conll17_ud_eval
 | 
				
			||||||
from spacy.tokens import Token, Doc
 | 
					from spacy.tokens import Token, Doc
 | 
				
			||||||
from spacy.gold import GoldParse, Example
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.util import compounding, minibatch, minibatch_by_words
 | 
					from spacy.util import compounding, minibatch, minibatch_by_words
 | 
				
			||||||
from spacy.syntax.nonproj import projectivize
 | 
					from spacy.syntax.nonproj import projectivize
 | 
				
			||||||
from spacy.matcher import Matcher
 | 
					from spacy.matcher import Matcher
 | 
				
			||||||
| 
						 | 
					@ -83,11 +83,11 @@ def read_data(
 | 
				
			||||||
                sent["heads"].append(head)
 | 
					                sent["heads"].append(head)
 | 
				
			||||||
                sent["deps"].append("ROOT" if dep == "root" else dep)
 | 
					                sent["deps"].append("ROOT" if dep == "root" else dep)
 | 
				
			||||||
                sent["spaces"].append(space_after == "_")
 | 
					                sent["spaces"].append(space_after == "_")
 | 
				
			||||||
            sent["entities"] = ["-"] * len(sent["words"])
 | 
					            sent["entities"] = ["-"] * len(sent["words"])    # TODO: doc-level format
 | 
				
			||||||
            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
 | 
					            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
 | 
				
			||||||
            if oracle_segments:
 | 
					            if oracle_segments:
 | 
				
			||||||
                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
 | 
					                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
 | 
				
			||||||
                golds.append(GoldParse(docs[-1], **sent))
 | 
					                golds.append(sent)
 | 
				
			||||||
                assert golds[-1].morphology is not None
 | 
					                assert golds[-1].morphology is not None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            sent_annots.append(sent)
 | 
					            sent_annots.append(sent)
 | 
				
			||||||
| 
						 | 
					@ -151,28 +151,27 @@ def read_conllu(file_):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
 | 
					def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
 | 
				
			||||||
    # Flatten the conll annotations, and adjust the head indices
 | 
					    # Flatten the conll annotations, and adjust the head indices
 | 
				
			||||||
    flat = defaultdict(list)
 | 
					    gold = defaultdict(list)
 | 
				
			||||||
    sent_starts = []
 | 
					    sent_starts = []
 | 
				
			||||||
    for sent in sent_annots:
 | 
					    for sent in sent_annots:
 | 
				
			||||||
        flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
 | 
					        gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
 | 
				
			||||||
        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
 | 
					        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
 | 
				
			||||||
            flat[field].extend(sent[field])
 | 
					            gold[field].extend(sent[field])
 | 
				
			||||||
        sent_starts.append(True)
 | 
					        sent_starts.append(True)
 | 
				
			||||||
        sent_starts.extend([False] * (len(sent["words"]) - 1))
 | 
					        sent_starts.extend([False] * (len(sent["words"]) - 1))
 | 
				
			||||||
    # Construct text if necessary
 | 
					    # Construct text if necessary
 | 
				
			||||||
    assert len(flat["words"]) == len(flat["spaces"])
 | 
					    assert len(gold["words"]) == len(gold["spaces"])
 | 
				
			||||||
    if text is None:
 | 
					    if text is None:
 | 
				
			||||||
        text = "".join(
 | 
					        text = "".join(
 | 
				
			||||||
            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
 | 
					            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    doc = nlp.make_doc(text)
 | 
					    doc = nlp.make_doc(text)
 | 
				
			||||||
    flat.pop("spaces")
 | 
					    gold.pop("spaces")
 | 
				
			||||||
    gold = GoldParse(doc, **flat)
 | 
					    gold["sent_starts"] = sent_starts
 | 
				
			||||||
    gold.sent_starts = sent_starts
 | 
					 | 
				
			||||||
    for i in range(len(gold.heads)):
 | 
					    for i in range(len(gold.heads)):
 | 
				
			||||||
        if random.random() < drop_deps:
 | 
					        if random.random() < drop_deps:
 | 
				
			||||||
            gold.heads[i] = None
 | 
					            gold["heads"][i] = None
 | 
				
			||||||
            gold.labels[i] = None
 | 
					            gold["labels"][i] = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return doc, gold
 | 
					    return doc, gold
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -183,15 +182,10 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def golds_to_gold_data(docs, golds):
 | 
					def golds_to_gold_data(docs, golds):
 | 
				
			||||||
    """Get out the training data format used by begin_training, given the
 | 
					    """Get out the training data format used by begin_training"""
 | 
				
			||||||
    GoldParse objects."""
 | 
					 | 
				
			||||||
    data = []
 | 
					    data = []
 | 
				
			||||||
    for doc, gold in zip(docs, golds):
 | 
					    for doc, gold in zip(docs, golds):
 | 
				
			||||||
        example = Example(doc=doc)
 | 
					        example = Example.from_dict(doc, gold)
 | 
				
			||||||
        example.add_doc_annotation(cats=gold.cats)
 | 
					 | 
				
			||||||
        token_annotation_dict = gold.orig.to_dict()
 | 
					 | 
				
			||||||
        example.add_token_annotation(**token_annotation_dict)
 | 
					 | 
				
			||||||
        example.goldparse = gold
 | 
					 | 
				
			||||||
        data.append(example)
 | 
					        data.append(example)
 | 
				
			||||||
    return data
 | 
					    return data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -359,8 +353,8 @@ def initialize_pipeline(nlp, examples, config, device):
 | 
				
			||||||
        nlp.parser.add_multitask_objective("tag")
 | 
					        nlp.parser.add_multitask_objective("tag")
 | 
				
			||||||
    if config.multitask_sent:
 | 
					    if config.multitask_sent:
 | 
				
			||||||
        nlp.parser.add_multitask_objective("sent_start")
 | 
					        nlp.parser.add_multitask_objective("sent_start")
 | 
				
			||||||
    for ex in examples:
 | 
					    for eg in examples:
 | 
				
			||||||
        gold = ex.gold
 | 
					        gold = eg.gold
 | 
				
			||||||
        for tag in gold.tags:
 | 
					        for tag in gold.tags:
 | 
				
			||||||
            if tag is not None:
 | 
					            if tag is not None:
 | 
				
			||||||
                nlp.tagger.add_label(tag)
 | 
					                nlp.tagger.add_label(tag)
 | 
				
			||||||
| 
						 | 
					@ -541,7 +535,7 @@ def main(
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            batches = minibatch(examples, size=batch_sizes)
 | 
					            batches = minibatch(examples, size=batch_sizes)
 | 
				
			||||||
        losses = {}
 | 
					        losses = {}
 | 
				
			||||||
        n_train_words = sum(len(ex.doc) for ex in examples)
 | 
					        n_train_words = sum(len(eg.doc) for eg in examples)
 | 
				
			||||||
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
 | 
					        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
 | 
				
			||||||
            for batch in batches:
 | 
					            for batch in batches:
 | 
				
			||||||
                pbar.update(sum(len(ex.doc) for ex in batch))
 | 
					                pbar.update(sum(len(ex.doc) for ex in batch))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,17 +5,16 @@
 | 
				
			||||||
# data is passed in sentence-by-sentence via some prior preprocessing.
 | 
					# data is passed in sentence-by-sentence via some prior preprocessing.
 | 
				
			||||||
gold_preproc = false
 | 
					gold_preproc = false
 | 
				
			||||||
# Limitations on training document length or number of examples.
 | 
					# Limitations on training document length or number of examples.
 | 
				
			||||||
max_length = 0
 | 
					max_length = 5000
 | 
				
			||||||
limit = 0
 | 
					limit = 0
 | 
				
			||||||
# Data augmentation
 | 
					# Data augmentation
 | 
				
			||||||
orth_variant_level = 0.0
 | 
					orth_variant_level = 0.0
 | 
				
			||||||
noise_level = 0.0
 | 
					 | 
				
			||||||
dropout = 0.1
 | 
					dropout = 0.1
 | 
				
			||||||
# Controls early-stopping. 0 or -1 mean unlimited.
 | 
					# Controls early-stopping. 0 or -1 mean unlimited.
 | 
				
			||||||
patience = 1600
 | 
					patience = 1600
 | 
				
			||||||
max_epochs = 0
 | 
					max_epochs = 0
 | 
				
			||||||
max_steps = 20000
 | 
					max_steps = 20000
 | 
				
			||||||
eval_frequency = 400
 | 
					eval_frequency = 200
 | 
				
			||||||
# Other settings
 | 
					# Other settings
 | 
				
			||||||
seed = 0
 | 
					seed = 0
 | 
				
			||||||
accumulate_gradient = 1
 | 
					accumulate_gradient = 1
 | 
				
			||||||
| 
						 | 
					@ -41,15 +40,15 @@ beta2 = 0.999
 | 
				
			||||||
L2_is_weight_decay = true
 | 
					L2_is_weight_decay = true
 | 
				
			||||||
L2 = 0.01
 | 
					L2 = 0.01
 | 
				
			||||||
grad_clip = 1.0
 | 
					grad_clip = 1.0
 | 
				
			||||||
use_averages = true
 | 
					use_averages = false
 | 
				
			||||||
eps = 1e-8
 | 
					eps = 1e-8
 | 
				
			||||||
learn_rate = 0.001
 | 
					#learn_rate = 0.001
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#[optimizer.learn_rate]
 | 
					[optimizer.learn_rate]
 | 
				
			||||||
#@schedules = "warmup_linear.v1"
 | 
					@schedules = "warmup_linear.v1"
 | 
				
			||||||
#warmup_steps = 250
 | 
					warmup_steps = 250
 | 
				
			||||||
#total_steps = 20000
 | 
					total_steps = 20000
 | 
				
			||||||
#initial_rate = 0.001
 | 
					initial_rate = 0.001
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp]
 | 
					[nlp]
 | 
				
			||||||
lang = "en"
 | 
					lang = "en"
 | 
				
			||||||
| 
						 | 
					@ -58,15 +57,11 @@ vectors = null
 | 
				
			||||||
[nlp.pipeline.tok2vec]
 | 
					[nlp.pipeline.tok2vec]
 | 
				
			||||||
factory = "tok2vec"
 | 
					factory = "tok2vec"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.pipeline.senter]
 | 
					 | 
				
			||||||
factory = "senter"
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.pipeline.ner]
 | 
					[nlp.pipeline.ner]
 | 
				
			||||||
factory = "ner"
 | 
					factory = "ner"
 | 
				
			||||||
learn_tokens = false
 | 
					learn_tokens = false
 | 
				
			||||||
min_action_freq = 1
 | 
					min_action_freq = 1
 | 
				
			||||||
beam_width = 1
 | 
					 | 
				
			||||||
beam_update_prob = 1.0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.pipeline.tagger]
 | 
					[nlp.pipeline.tagger]
 | 
				
			||||||
factory = "tagger"
 | 
					factory = "tagger"
 | 
				
			||||||
| 
						 | 
					@ -74,16 +69,7 @@ factory = "tagger"
 | 
				
			||||||
[nlp.pipeline.parser]
 | 
					[nlp.pipeline.parser]
 | 
				
			||||||
factory = "parser"
 | 
					factory = "parser"
 | 
				
			||||||
learn_tokens = false
 | 
					learn_tokens = false
 | 
				
			||||||
min_action_freq = 1
 | 
					min_action_freq = 30
 | 
				
			||||||
beam_width = 1
 | 
					 | 
				
			||||||
beam_update_prob = 1.0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
[nlp.pipeline.senter.model]
 | 
					 | 
				
			||||||
@architectures = "spacy.Tagger.v1"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
[nlp.pipeline.senter.model.tok2vec]
 | 
					 | 
				
			||||||
@architectures = "spacy.Tok2VecTensors.v1"
 | 
					 | 
				
			||||||
width = ${nlp.pipeline.tok2vec.model:width}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.pipeline.tagger.model]
 | 
					[nlp.pipeline.tagger.model]
 | 
				
			||||||
@architectures = "spacy.Tagger.v1"
 | 
					@architectures = "spacy.Tagger.v1"
 | 
				
			||||||
| 
						 | 
					@ -96,8 +82,8 @@ width = ${nlp.pipeline.tok2vec.model:width}
 | 
				
			||||||
@architectures = "spacy.TransitionBasedParser.v1"
 | 
					@architectures = "spacy.TransitionBasedParser.v1"
 | 
				
			||||||
nr_feature_tokens = 8
 | 
					nr_feature_tokens = 8
 | 
				
			||||||
hidden_width = 128
 | 
					hidden_width = 128
 | 
				
			||||||
maxout_pieces = 3
 | 
					maxout_pieces = 2
 | 
				
			||||||
use_upper = false
 | 
					use_upper = true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.pipeline.parser.model.tok2vec]
 | 
					[nlp.pipeline.parser.model.tok2vec]
 | 
				
			||||||
@architectures = "spacy.Tok2VecTensors.v1"
 | 
					@architectures = "spacy.Tok2VecTensors.v1"
 | 
				
			||||||
| 
						 | 
					@ -107,8 +93,8 @@ width = ${nlp.pipeline.tok2vec.model:width}
 | 
				
			||||||
@architectures = "spacy.TransitionBasedParser.v1"
 | 
					@architectures = "spacy.TransitionBasedParser.v1"
 | 
				
			||||||
nr_feature_tokens = 3
 | 
					nr_feature_tokens = 3
 | 
				
			||||||
hidden_width = 128
 | 
					hidden_width = 128
 | 
				
			||||||
maxout_pieces = 3
 | 
					maxout_pieces = 2
 | 
				
			||||||
use_upper = false
 | 
					use_upper = true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[nlp.pipeline.ner.model.tok2vec]
 | 
					[nlp.pipeline.ner.model.tok2vec]
 | 
				
			||||||
@architectures = "spacy.Tok2VecTensors.v1"
 | 
					@architectures = "spacy.Tok2VecTensors.v1"
 | 
				
			||||||
| 
						 | 
					@ -117,10 +103,10 @@ width = ${nlp.pipeline.tok2vec.model:width}
 | 
				
			||||||
[nlp.pipeline.tok2vec.model]
 | 
					[nlp.pipeline.tok2vec.model]
 | 
				
			||||||
@architectures = "spacy.HashEmbedCNN.v1"
 | 
					@architectures = "spacy.HashEmbedCNN.v1"
 | 
				
			||||||
pretrained_vectors = ${nlp:vectors}
 | 
					pretrained_vectors = ${nlp:vectors}
 | 
				
			||||||
width = 256
 | 
					width = 128
 | 
				
			||||||
depth = 6
 | 
					depth = 4
 | 
				
			||||||
window_size = 1
 | 
					window_size = 1
 | 
				
			||||||
embed_size = 10000
 | 
					embed_size = 7000
 | 
				
			||||||
maxout_pieces = 3
 | 
					maxout_pieces = 3
 | 
				
			||||||
subword_features = true
 | 
					subword_features = true
 | 
				
			||||||
dropout = null
 | 
					dropout = ${training:dropout}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,7 +9,6 @@ max_length = 0
 | 
				
			||||||
limit = 0
 | 
					limit = 0
 | 
				
			||||||
# Data augmentation
 | 
					# Data augmentation
 | 
				
			||||||
orth_variant_level = 0.0
 | 
					orth_variant_level = 0.0
 | 
				
			||||||
noise_level = 0.0
 | 
					 | 
				
			||||||
dropout = 0.1
 | 
					dropout = 0.1
 | 
				
			||||||
# Controls early-stopping. 0 or -1 mean unlimited.
 | 
					# Controls early-stopping. 0 or -1 mean unlimited.
 | 
				
			||||||
patience = 1600
 | 
					patience = 1600
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										80
									
								
								examples/experiments/onto-ner.cfg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								examples/experiments/onto-ner.cfg
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,80 @@
 | 
				
			||||||
 | 
					# Training hyper-parameters and additional features.
 | 
				
			||||||
 | 
					[training]
 | 
				
			||||||
 | 
					# Whether to train on sequences with 'gold standard' sentence boundaries
 | 
				
			||||||
 | 
					# and tokens. If you set this to true, take care to ensure your run-time
 | 
				
			||||||
 | 
					# data is passed in sentence-by-sentence via some prior preprocessing.
 | 
				
			||||||
 | 
					gold_preproc = false
 | 
				
			||||||
 | 
					# Limitations on training document length or number of examples.
 | 
				
			||||||
 | 
					max_length = 5000
 | 
				
			||||||
 | 
					limit = 0
 | 
				
			||||||
 | 
					# Data augmentation
 | 
				
			||||||
 | 
					orth_variant_level = 0.0
 | 
				
			||||||
 | 
					dropout = 0.2
 | 
				
			||||||
 | 
					# Controls early-stopping. 0 or -1 mean unlimited.
 | 
				
			||||||
 | 
					patience = 1600
 | 
				
			||||||
 | 
					max_epochs = 0
 | 
				
			||||||
 | 
					max_steps = 20000
 | 
				
			||||||
 | 
					eval_frequency = 500
 | 
				
			||||||
 | 
					# Other settings
 | 
				
			||||||
 | 
					seed = 0
 | 
				
			||||||
 | 
					accumulate_gradient = 1
 | 
				
			||||||
 | 
					use_pytorch_for_gpu_memory = false
 | 
				
			||||||
 | 
					# Control how scores are printed and checkpoints are evaluated.
 | 
				
			||||||
 | 
					scores = ["speed", "ents_p", "ents_r", "ents_f"]
 | 
				
			||||||
 | 
					score_weights = {"ents_f": 1.0}
 | 
				
			||||||
 | 
					# These settings are invalid for the transformer models.
 | 
				
			||||||
 | 
					init_tok2vec = null
 | 
				
			||||||
 | 
					discard_oversize = false
 | 
				
			||||||
 | 
					omit_extra_lookups = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[training.batch_size]
 | 
				
			||||||
 | 
					@schedules = "compounding.v1"
 | 
				
			||||||
 | 
					start = 100
 | 
				
			||||||
 | 
					stop = 1000
 | 
				
			||||||
 | 
					compound = 1.001
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[training.optimizer]
 | 
				
			||||||
 | 
					@optimizers = "Adam.v1"
 | 
				
			||||||
 | 
					beta1 = 0.9
 | 
				
			||||||
 | 
					beta2 = 0.999
 | 
				
			||||||
 | 
					L2_is_weight_decay = false
 | 
				
			||||||
 | 
					L2 = 1e-6
 | 
				
			||||||
 | 
					grad_clip = 1.0
 | 
				
			||||||
 | 
					use_averages = true
 | 
				
			||||||
 | 
					eps = 1e-8
 | 
				
			||||||
 | 
					learn_rate = 0.001
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#[optimizer.learn_rate]
 | 
				
			||||||
 | 
					#@schedules = "warmup_linear.v1"
 | 
				
			||||||
 | 
					#warmup_steps = 250
 | 
				
			||||||
 | 
					#total_steps = 20000
 | 
				
			||||||
 | 
					#initial_rate = 0.001
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[nlp]
 | 
				
			||||||
 | 
					lang = "en"
 | 
				
			||||||
 | 
					vectors = null
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[nlp.pipeline.ner]
 | 
				
			||||||
 | 
					factory = "ner"
 | 
				
			||||||
 | 
					learn_tokens = false
 | 
				
			||||||
 | 
					min_action_freq = 1
 | 
				
			||||||
 | 
					beam_width = 1
 | 
				
			||||||
 | 
					beam_update_prob = 1.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[nlp.pipeline.ner.model]
 | 
				
			||||||
 | 
					@architectures = "spacy.TransitionBasedParser.v1"
 | 
				
			||||||
 | 
					nr_feature_tokens = 3
 | 
				
			||||||
 | 
					hidden_width = 64
 | 
				
			||||||
 | 
					maxout_pieces = 2
 | 
				
			||||||
 | 
					use_upper = true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[nlp.pipeline.ner.model.tok2vec]
 | 
				
			||||||
 | 
					@architectures = "spacy.HashEmbedCNN.v1"
 | 
				
			||||||
 | 
					pretrained_vectors = ${nlp:vectors}
 | 
				
			||||||
 | 
					width = 96
 | 
				
			||||||
 | 
					depth = 4
 | 
				
			||||||
 | 
					window_size = 1
 | 
				
			||||||
 | 
					embed_size = 2000
 | 
				
			||||||
 | 
					maxout_pieces = 3
 | 
				
			||||||
 | 
					subword_features = true
 | 
				
			||||||
 | 
					dropout = ${training:dropout}
 | 
				
			||||||
| 
						 | 
					@ -6,7 +6,6 @@ init_tok2vec = null
 | 
				
			||||||
vectors = null
 | 
					vectors = null
 | 
				
			||||||
max_epochs = 100
 | 
					max_epochs = 100
 | 
				
			||||||
orth_variant_level = 0.0
 | 
					orth_variant_level = 0.0
 | 
				
			||||||
noise_level = 0.0
 | 
					 | 
				
			||||||
gold_preproc = true
 | 
					gold_preproc = true
 | 
				
			||||||
max_length = 0
 | 
					max_length = 0
 | 
				
			||||||
use_gpu = 0
 | 
					use_gpu = 0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -6,7 +6,6 @@ init_tok2vec = null
 | 
				
			||||||
vectors = null
 | 
					vectors = null
 | 
				
			||||||
max_epochs = 100
 | 
					max_epochs = 100
 | 
				
			||||||
orth_variant_level = 0.0
 | 
					orth_variant_level = 0.0
 | 
				
			||||||
noise_level = 0.0
 | 
					 | 
				
			||||||
gold_preproc = true
 | 
					gold_preproc = true
 | 
				
			||||||
max_length = 0
 | 
					max_length = 0
 | 
				
			||||||
use_gpu = -1
 | 
					use_gpu = -1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -12,7 +12,7 @@ import tqdm
 | 
				
			||||||
import spacy
 | 
					import spacy
 | 
				
			||||||
import spacy.util
 | 
					import spacy.util
 | 
				
			||||||
from spacy.tokens import Token, Doc
 | 
					from spacy.tokens import Token, Doc
 | 
				
			||||||
from spacy.gold import GoldParse, Example
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.syntax.nonproj import projectivize
 | 
					from spacy.syntax.nonproj import projectivize
 | 
				
			||||||
from collections import defaultdict
 | 
					from collections import defaultdict
 | 
				
			||||||
from spacy.matcher import Matcher
 | 
					from spacy.matcher import Matcher
 | 
				
			||||||
| 
						 | 
					@ -33,31 +33,6 @@ random.seed(0)
 | 
				
			||||||
numpy.random.seed(0)
 | 
					numpy.random.seed(0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def minibatch_by_words(examples, size=5000):
 | 
					 | 
				
			||||||
    random.shuffle(examples)
 | 
					 | 
				
			||||||
    if isinstance(size, int):
 | 
					 | 
				
			||||||
        size_ = itertools.repeat(size)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        size_ = size
 | 
					 | 
				
			||||||
    examples = iter(examples)
 | 
					 | 
				
			||||||
    while True:
 | 
					 | 
				
			||||||
        batch_size = next(size_)
 | 
					 | 
				
			||||||
        batch = []
 | 
					 | 
				
			||||||
        while batch_size >= 0:
 | 
					 | 
				
			||||||
            try:
 | 
					 | 
				
			||||||
                example = next(examples)
 | 
					 | 
				
			||||||
            except StopIteration:
 | 
					 | 
				
			||||||
                if batch:
 | 
					 | 
				
			||||||
                    yield batch
 | 
					 | 
				
			||||||
                return
 | 
					 | 
				
			||||||
            batch_size -= len(example.doc)
 | 
					 | 
				
			||||||
            batch.append(example)
 | 
					 | 
				
			||||||
        if batch:
 | 
					 | 
				
			||||||
            yield batch
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            break
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
################
 | 
					################
 | 
				
			||||||
# Data reading #
 | 
					# Data reading #
 | 
				
			||||||
################
 | 
					################
 | 
				
			||||||
| 
						 | 
					@ -110,7 +85,7 @@ def read_data(
 | 
				
			||||||
            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
 | 
					            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
 | 
				
			||||||
            if oracle_segments:
 | 
					            if oracle_segments:
 | 
				
			||||||
                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
 | 
					                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
 | 
				
			||||||
                golds.append(GoldParse(docs[-1], **sent))
 | 
					                golds.append(sent)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            sent_annots.append(sent)
 | 
					            sent_annots.append(sent)
 | 
				
			||||||
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
 | 
					            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
 | 
				
			||||||
| 
						 | 
					@ -159,20 +134,19 @@ def read_conllu(file_):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _make_gold(nlp, text, sent_annots):
 | 
					def _make_gold(nlp, text, sent_annots):
 | 
				
			||||||
    # Flatten the conll annotations, and adjust the head indices
 | 
					    # Flatten the conll annotations, and adjust the head indices
 | 
				
			||||||
    flat = defaultdict(list)
 | 
					    gold = defaultdict(list)
 | 
				
			||||||
    for sent in sent_annots:
 | 
					    for sent in sent_annots:
 | 
				
			||||||
        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
 | 
					        gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"])
 | 
				
			||||||
        for field in ["words", "tags", "deps", "entities", "spaces"]:
 | 
					        for field in ["words", "tags", "deps", "entities", "spaces"]:
 | 
				
			||||||
            flat[field].extend(sent[field])
 | 
					            gold[field].extend(sent[field])
 | 
				
			||||||
    # Construct text if necessary
 | 
					    # Construct text if necessary
 | 
				
			||||||
    assert len(flat["words"]) == len(flat["spaces"])
 | 
					    assert len(gold["words"]) == len(gold["spaces"])
 | 
				
			||||||
    if text is None:
 | 
					    if text is None:
 | 
				
			||||||
        text = "".join(
 | 
					        text = "".join(
 | 
				
			||||||
            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
 | 
					            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    doc = nlp.make_doc(text)
 | 
					    doc = nlp.make_doc(text)
 | 
				
			||||||
    flat.pop("spaces")
 | 
					    gold.pop("spaces")
 | 
				
			||||||
    gold = GoldParse(doc, **flat)
 | 
					 | 
				
			||||||
    return doc, gold
 | 
					    return doc, gold
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -182,15 +156,10 @@ def _make_gold(nlp, text, sent_annots):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def golds_to_gold_data(docs, golds):
 | 
					def golds_to_gold_data(docs, golds):
 | 
				
			||||||
    """Get out the training data format used by begin_training, given the
 | 
					    """Get out the training data format used by begin_training."""
 | 
				
			||||||
    GoldParse objects."""
 | 
					 | 
				
			||||||
    data = []
 | 
					    data = []
 | 
				
			||||||
    for doc, gold in zip(docs, golds):
 | 
					    for doc, gold in zip(docs, golds):
 | 
				
			||||||
        example = Example(doc=doc)
 | 
					        example = Example.from_dict(doc, gold)
 | 
				
			||||||
        example.add_doc_annotation(cats=gold.cats)
 | 
					 | 
				
			||||||
        token_annotation_dict = gold.orig.to_dict()
 | 
					 | 
				
			||||||
        example.add_token_annotation(**token_annotation_dict)
 | 
					 | 
				
			||||||
        example.goldparse = gold
 | 
					 | 
				
			||||||
        data.append(example)
 | 
					        data.append(example)
 | 
				
			||||||
    return data
 | 
					    return data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -313,15 +282,15 @@ def initialize_pipeline(nlp, examples, config):
 | 
				
			||||||
        nlp.parser.add_multitask_objective("sent_start")
 | 
					        nlp.parser.add_multitask_objective("sent_start")
 | 
				
			||||||
    nlp.parser.moves.add_action(2, "subtok")
 | 
					    nlp.parser.moves.add_action(2, "subtok")
 | 
				
			||||||
    nlp.add_pipe(nlp.create_pipe("tagger"))
 | 
					    nlp.add_pipe(nlp.create_pipe("tagger"))
 | 
				
			||||||
    for ex in examples:
 | 
					    for eg in examples:
 | 
				
			||||||
        for tag in ex.gold.tags:
 | 
					        for tag in eg.gold.tags:
 | 
				
			||||||
            if tag is not None:
 | 
					            if tag is not None:
 | 
				
			||||||
                nlp.tagger.add_label(tag)
 | 
					                nlp.tagger.add_label(tag)
 | 
				
			||||||
    # Replace labels that didn't make the frequency cutoff
 | 
					    # Replace labels that didn't make the frequency cutoff
 | 
				
			||||||
    actions = set(nlp.parser.labels)
 | 
					    actions = set(nlp.parser.labels)
 | 
				
			||||||
    label_set = set([act.split("-")[1] for act in actions if "-" in act])
 | 
					    label_set = set([act.split("-")[1] for act in actions if "-" in act])
 | 
				
			||||||
    for ex in examples:
 | 
					    for eg in examples:
 | 
				
			||||||
        gold = ex.gold
 | 
					        gold = eg.gold
 | 
				
			||||||
        for i, label in enumerate(gold.labels):
 | 
					        for i, label in enumerate(gold.labels):
 | 
				
			||||||
            if label is not None and label not in label_set:
 | 
					            if label is not None and label not in label_set:
 | 
				
			||||||
                gold.labels[i] = label.split("||")[0]
 | 
					                gold.labels[i] = label.split("||")[0]
 | 
				
			||||||
| 
						 | 
					@ -415,13 +384,12 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
 | 
				
			||||||
    optimizer = initialize_pipeline(nlp, examples, config)
 | 
					    optimizer = initialize_pipeline(nlp, examples, config)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for i in range(config.nr_epoch):
 | 
					    for i in range(config.nr_epoch):
 | 
				
			||||||
        docs = [nlp.make_doc(example.doc.text) for example in examples]
 | 
					        batches = spacy.minibatch_by_words(examples, size=config.batch_size)
 | 
				
			||||||
        batches = minibatch_by_words(examples, size=config.batch_size)
 | 
					 | 
				
			||||||
        losses = {}
 | 
					        losses = {}
 | 
				
			||||||
        n_train_words = sum(len(doc) for doc in docs)
 | 
					        n_train_words = sum(len(eg.reference.doc) for eg in examples)
 | 
				
			||||||
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
 | 
					        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
 | 
				
			||||||
            for batch in batches:
 | 
					            for batch in batches:
 | 
				
			||||||
                pbar.update(sum(len(ex.doc) for ex in batch))
 | 
					                pbar.update(sum(len(eg.reference.doc) for eg in batch))
 | 
				
			||||||
                nlp.update(
 | 
					                nlp.update(
 | 
				
			||||||
                    examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
 | 
					                    examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -30,7 +30,7 @@ ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
 | 
				
			||||||
    model=("Model name, should have pretrained word embeddings", "positional", None, str),
 | 
					    model=("Model name, should have pretrained word embeddings", "positional", None, str),
 | 
				
			||||||
    output_dir=("Optional output directory", "option", "o", Path),
 | 
					    output_dir=("Optional output directory", "option", "o", Path),
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
def main(model=None, output_dir=None):
 | 
					def main(model, output_dir=None):
 | 
				
			||||||
    """Load the model and create the KB with pre-defined entity encodings.
 | 
					    """Load the model and create the KB with pre-defined entity encodings.
 | 
				
			||||||
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
 | 
					    If an output_dir is provided, the KB will be stored there in a file 'kb'.
 | 
				
			||||||
    The updated vocab will also be written to a directory in the output_dir."""
 | 
					    The updated vocab will also be written to a directory in the output_dir."""
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -24,8 +24,10 @@ import random
 | 
				
			||||||
import plac
 | 
					import plac
 | 
				
			||||||
import spacy
 | 
					import spacy
 | 
				
			||||||
import os.path
 | 
					import os.path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from spacy.gold.example import Example
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
from spacy.gold import read_json_file, GoldParse
 | 
					from spacy.gold import read_json_file
 | 
				
			||||||
 | 
					
 | 
				
			||||||
random.seed(0)
 | 
					random.seed(0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -59,27 +61,25 @@ def main(n_iter=10):
 | 
				
			||||||
    print(nlp.pipeline)
 | 
					    print(nlp.pipeline)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    print("Create data", len(TRAIN_DATA))
 | 
					    print("Create data", len(TRAIN_DATA))
 | 
				
			||||||
    optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
 | 
					    optimizer = nlp.begin_training()
 | 
				
			||||||
    for itn in range(n_iter):
 | 
					    for itn in range(n_iter):
 | 
				
			||||||
        random.shuffle(TRAIN_DATA)
 | 
					        random.shuffle(TRAIN_DATA)
 | 
				
			||||||
        losses = {}
 | 
					        losses = {}
 | 
				
			||||||
        for example in TRAIN_DATA:
 | 
					        for example_dict in TRAIN_DATA:
 | 
				
			||||||
            for token_annotation in example.token_annotations:
 | 
					            doc = Doc(nlp.vocab, words=example_dict["words"])
 | 
				
			||||||
                doc = Doc(nlp.vocab, words=token_annotation.words)
 | 
					            example = Example.from_dict(doc, example_dict)
 | 
				
			||||||
                gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
 | 
					            nlp.update(
 | 
				
			||||||
 | 
					                examples=[example],  # 1 example
 | 
				
			||||||
                nlp.update(
 | 
					                drop=0.2,  # dropout - make it harder to memorise data
 | 
				
			||||||
                    examples=[(doc, gold)],  # 1 example
 | 
					                sgd=optimizer,  # callable to update weights
 | 
				
			||||||
                    drop=0.2,  # dropout - make it harder to memorise data
 | 
					                losses=losses,
 | 
				
			||||||
                    sgd=optimizer,  # callable to update weights
 | 
					            )
 | 
				
			||||||
                    losses=losses,
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
        print(losses.get("nn_labeller", 0.0), losses["ner"])
 | 
					        print(losses.get("nn_labeller", 0.0), losses["ner"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # test the trained model
 | 
					    # test the trained model
 | 
				
			||||||
    for example in TRAIN_DATA:
 | 
					    for example_dict in TRAIN_DATA:
 | 
				
			||||||
        if example.text is not None:
 | 
					        if "text" in example_dict:
 | 
				
			||||||
            doc = nlp(example.text)
 | 
					            doc = nlp(example_dict["text"])
 | 
				
			||||||
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
 | 
					            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
 | 
				
			||||||
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 | 
					            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,9 +4,10 @@ import random
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import spacy
 | 
					import spacy
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.util import minibatch, compounding
 | 
					from spacy.util import minibatch, compounding
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# TODO: further fix & test this script for v.3 ? (read_gold_data is never called)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
LABEL = "ANIMAL"
 | 
					LABEL = "ANIMAL"
 | 
				
			||||||
TRAIN_DATA = [
 | 
					TRAIN_DATA = [
 | 
				
			||||||
| 
						 | 
					@ -36,15 +37,13 @@ def read_raw_data(nlp, jsonl_loc):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def read_gold_data(nlp, gold_loc):
 | 
					def read_gold_data(nlp, gold_loc):
 | 
				
			||||||
    docs = []
 | 
					    examples = []
 | 
				
			||||||
    golds = []
 | 
					 | 
				
			||||||
    for json_obj in srsly.read_jsonl(gold_loc):
 | 
					    for json_obj in srsly.read_jsonl(gold_loc):
 | 
				
			||||||
        doc = nlp.make_doc(json_obj["text"])
 | 
					        doc = nlp.make_doc(json_obj["text"])
 | 
				
			||||||
        ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]]
 | 
					        ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]]
 | 
				
			||||||
        gold = GoldParse(doc, entities=ents)
 | 
					        example = Example.from_dict(doc, {"entities": ents})
 | 
				
			||||||
        docs.append(doc)
 | 
					        examples.append(example)
 | 
				
			||||||
        golds.append(gold)
 | 
					    return examples
 | 
				
			||||||
    return list(zip(docs, golds))
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def main(model_name, unlabelled_loc):
 | 
					def main(model_name, unlabelled_loc):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -19,7 +19,7 @@ from ml_datasets import loaders
 | 
				
			||||||
import spacy
 | 
					import spacy
 | 
				
			||||||
from spacy import util
 | 
					from spacy import util
 | 
				
			||||||
from spacy.util import minibatch, compounding
 | 
					from spacy.util import minibatch, compounding
 | 
				
			||||||
from spacy.gold import Example, GoldParse
 | 
					from spacy.gold import Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@plac.annotations(
 | 
					@plac.annotations(
 | 
				
			||||||
| 
						 | 
					@ -62,11 +62,10 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non
 | 
				
			||||||
    train_examples = []
 | 
					    train_examples = []
 | 
				
			||||||
    for text, cats in zip(train_texts, train_cats):
 | 
					    for text, cats in zip(train_texts, train_cats):
 | 
				
			||||||
        doc = nlp.make_doc(text)
 | 
					        doc = nlp.make_doc(text)
 | 
				
			||||||
        gold = GoldParse(doc, cats=cats)
 | 
					        example = Example.from_dict(doc, {"cats": cats})
 | 
				
			||||||
        for cat in cats:
 | 
					        for cat in cats:
 | 
				
			||||||
            textcat.add_label(cat)
 | 
					            textcat.add_label(cat)
 | 
				
			||||||
        ex = Example.from_gold(gold, doc=doc)
 | 
					        train_examples.append(example)
 | 
				
			||||||
        train_examples.append(ex)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with nlp.select_pipes(enable="textcat"):  # only train textcat
 | 
					    with nlp.select_pipes(enable="textcat"):  # only train textcat
 | 
				
			||||||
        optimizer = nlp.begin_training()
 | 
					        optimizer = nlp.begin_training()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										7
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								setup.py
									
									
									
									
									
								
							| 
						 | 
					@ -23,6 +23,8 @@ Options.docstrings = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
PACKAGES = find_packages()
 | 
					PACKAGES = find_packages()
 | 
				
			||||||
MOD_NAMES = [
 | 
					MOD_NAMES = [
 | 
				
			||||||
 | 
					    "spacy.gold.align",
 | 
				
			||||||
 | 
					    "spacy.gold.example",
 | 
				
			||||||
    "spacy.parts_of_speech",
 | 
					    "spacy.parts_of_speech",
 | 
				
			||||||
    "spacy.strings",
 | 
					    "spacy.strings",
 | 
				
			||||||
    "spacy.lexeme",
 | 
					    "spacy.lexeme",
 | 
				
			||||||
| 
						 | 
					@ -37,11 +39,10 @@ MOD_NAMES = [
 | 
				
			||||||
    "spacy.tokenizer",
 | 
					    "spacy.tokenizer",
 | 
				
			||||||
    "spacy.syntax.nn_parser",
 | 
					    "spacy.syntax.nn_parser",
 | 
				
			||||||
    "spacy.syntax._parser_model",
 | 
					    "spacy.syntax._parser_model",
 | 
				
			||||||
    "spacy.syntax._beam_utils",
 | 
					 | 
				
			||||||
    "spacy.syntax.nonproj",
 | 
					    "spacy.syntax.nonproj",
 | 
				
			||||||
    "spacy.syntax.transition_system",
 | 
					    "spacy.syntax.transition_system",
 | 
				
			||||||
    "spacy.syntax.arc_eager",
 | 
					    "spacy.syntax.arc_eager",
 | 
				
			||||||
    "spacy.gold",
 | 
					    "spacy.gold.gold_io",
 | 
				
			||||||
    "spacy.tokens.doc",
 | 
					    "spacy.tokens.doc",
 | 
				
			||||||
    "spacy.tokens.span",
 | 
					    "spacy.tokens.span",
 | 
				
			||||||
    "spacy.tokens.token",
 | 
					    "spacy.tokens.token",
 | 
				
			||||||
| 
						 | 
					@ -120,7 +121,7 @@ class build_ext_subclass(build_ext, build_ext_options):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def clean(path):
 | 
					def clean(path):
 | 
				
			||||||
    for path in path.glob("**/*"):
 | 
					    for path in path.glob("**/*"):
 | 
				
			||||||
        if path.is_file() and path.suffix in (".so", ".cpp"):
 | 
					        if path.is_file() and path.suffix in (".so", ".cpp", ".html"):
 | 
				
			||||||
            print(f"Deleting {path.name}")
 | 
					            print(f"Deleting {path.name}")
 | 
				
			||||||
            path.unlink()
 | 
					            path.unlink()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,6 @@
 | 
				
			||||||
# fmt: off
 | 
					# fmt: off
 | 
				
			||||||
__title__ = "spacy"
 | 
					__title__ = "spacy"
 | 
				
			||||||
__version__ = "3.0.0.dev9"
 | 
					__version__ = "3.0.0"
 | 
				
			||||||
__release__ = True
 | 
					__release__ = True
 | 
				
			||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 | 
					__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 | 
				
			||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 | 
					__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8,7 +8,7 @@ from .download import download  # noqa: F401
 | 
				
			||||||
from .info import info  # noqa: F401
 | 
					from .info import info  # noqa: F401
 | 
				
			||||||
from .package import package  # noqa: F401
 | 
					from .package import package  # noqa: F401
 | 
				
			||||||
from .profile import profile  # noqa: F401
 | 
					from .profile import profile  # noqa: F401
 | 
				
			||||||
from .train_from_config import train  # noqa: F401
 | 
					from .train import train_cli  # noqa: F401
 | 
				
			||||||
from .pretrain import pretrain  # noqa: F401
 | 
					from .pretrain import pretrain  # noqa: F401
 | 
				
			||||||
from .debug_data import debug_data  # noqa: F401
 | 
					from .debug_data import debug_data  # noqa: F401
 | 
				
			||||||
from .evaluate import evaluate  # noqa: F401
 | 
					from .evaluate import evaluate  # noqa: F401
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,53 +4,56 @@ from pathlib import Path
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import Printer
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._app import app, Arg, Opt
 | 
					from ._app import app, Arg, Opt
 | 
				
			||||||
from .converters import conllu2json, iob2json, conll_ner2json
 | 
					from ..gold import docs_to_json
 | 
				
			||||||
from .converters import ner_jsonl2json
 | 
					from ..tokens import DocBin
 | 
				
			||||||
 | 
					from ..gold.converters import iob2docs, conll_ner2docs, json2docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Converters are matched by file extension except for ner/iob, which are
 | 
					# Converters are matched by file extension except for ner/iob, which are
 | 
				
			||||||
# matched by file extension and content. To add a converter, add a new
 | 
					# matched by file extension and content. To add a converter, add a new
 | 
				
			||||||
# entry to this dict with the file extension mapped to the converter function
 | 
					# entry to this dict with the file extension mapped to the converter function
 | 
				
			||||||
# imported from /converters.
 | 
					# imported from /converters.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
CONVERTERS = {
 | 
					CONVERTERS = {
 | 
				
			||||||
    "conllubio": conllu2json,
 | 
					    # "conllubio": conllu2docs, TODO
 | 
				
			||||||
    "conllu": conllu2json,
 | 
					    # "conllu": conllu2docs, TODO
 | 
				
			||||||
    "conll": conllu2json,
 | 
					    # "conll": conllu2docs, TODO
 | 
				
			||||||
    "ner": conll_ner2json,
 | 
					    "ner": conll_ner2docs,
 | 
				
			||||||
    "iob": iob2json,
 | 
					    "iob": iob2docs,
 | 
				
			||||||
    "jsonl": ner_jsonl2json,
 | 
					    "json": json2docs,
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# File types
 | 
					
 | 
				
			||||||
FILE_TYPES_STDOUT = ("json", "jsonl")
 | 
					# File types that can be written to stdout
 | 
				
			||||||
 | 
					FILE_TYPES_STDOUT = ("json")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class FileTypes(str, Enum):
 | 
					class FileTypes(str, Enum):
 | 
				
			||||||
    json = "json"
 | 
					    json = "json"
 | 
				
			||||||
    jsonl = "jsonl"
 | 
					    spacy = "spacy"
 | 
				
			||||||
    msg = "msg"
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@app.command("convert")
 | 
					@app.command("convert")
 | 
				
			||||||
def convert_cli(
 | 
					def convert_cli(
 | 
				
			||||||
    # fmt: off
 | 
					    # fmt: off
 | 
				
			||||||
    input_file: str = Arg(..., help="Input file", exists=True),
 | 
					    input_path: str = Arg(..., help="Input file or directory", exists=True),
 | 
				
			||||||
    output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
 | 
					    output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
 | 
				
			||||||
    file_type: FileTypes = Opt(FileTypes.json.value, "--file-type", "-t", help="Type of data to produce"),
 | 
					    file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
 | 
				
			||||||
    n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
 | 
					    n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
 | 
				
			||||||
    seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
 | 
					    seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
 | 
				
			||||||
    model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
 | 
					    model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
 | 
				
			||||||
    morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
 | 
					    morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
 | 
				
			||||||
    merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
 | 
					    merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
 | 
				
			||||||
    converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
 | 
					    converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
 | 
				
			||||||
    ner_map_path: Optional[Path] = Opt(None, "--ner-map-path", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
 | 
					    ner_map: Optional[Path] = Opt(None, "--ner-map", "-N", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
 | 
				
			||||||
    lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
 | 
					    lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Convert files into JSON format for use with train command and other
 | 
					    Convert files into json or DocBin format for use with train command and other
 | 
				
			||||||
    experiment management functions. If no output_dir is specified, the data
 | 
					    experiment management functions. If no output_dir is specified, the data
 | 
				
			||||||
    is written to stdout, so you can pipe them forward to a JSON file:
 | 
					    is written to stdout, so you can pipe them forward to a JSON file:
 | 
				
			||||||
    $ spacy convert some_file.conllu > some_file.json
 | 
					    $ spacy convert some_file.conllu > some_file.json
 | 
				
			||||||
| 
						 | 
					@ -58,9 +61,15 @@ def convert_cli(
 | 
				
			||||||
    if isinstance(file_type, FileTypes):
 | 
					    if isinstance(file_type, FileTypes):
 | 
				
			||||||
        # We get an instance of the FileTypes from the CLI so we need its string value
 | 
					        # We get an instance of the FileTypes from the CLI so we need its string value
 | 
				
			||||||
        file_type = file_type.value
 | 
					        file_type = file_type.value
 | 
				
			||||||
 | 
					    input_path = Path(input_path)
 | 
				
			||||||
 | 
					    output_dir = "-" if output_dir == Path("-") else output_dir
 | 
				
			||||||
 | 
					    cli_args = locals()
 | 
				
			||||||
    silent = output_dir == "-"
 | 
					    silent = output_dir == "-"
 | 
				
			||||||
 | 
					    msg = Printer(no_print=silent)
 | 
				
			||||||
 | 
					    verify_cli_args(msg, **cli_args)
 | 
				
			||||||
 | 
					    converter = _get_converter(msg, converter, input_path)
 | 
				
			||||||
    convert(
 | 
					    convert(
 | 
				
			||||||
        input_file,
 | 
					        input_path,
 | 
				
			||||||
        output_dir,
 | 
					        output_dir,
 | 
				
			||||||
        file_type=file_type,
 | 
					        file_type=file_type,
 | 
				
			||||||
        n_sents=n_sents,
 | 
					        n_sents=n_sents,
 | 
				
			||||||
| 
						 | 
					@ -69,91 +78,77 @@ def convert_cli(
 | 
				
			||||||
        morphology=morphology,
 | 
					        morphology=morphology,
 | 
				
			||||||
        merge_subtokens=merge_subtokens,
 | 
					        merge_subtokens=merge_subtokens,
 | 
				
			||||||
        converter=converter,
 | 
					        converter=converter,
 | 
				
			||||||
        ner_map_path=ner_map_path,
 | 
					        ner_map=ner_map,
 | 
				
			||||||
        lang=lang,
 | 
					        lang=lang,
 | 
				
			||||||
        silent=silent,
 | 
					        silent=silent,
 | 
				
			||||||
 | 
					        msg=msg,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def convert(
 | 
					def convert(
 | 
				
			||||||
    input_file: Path,
 | 
					        input_path: Path,
 | 
				
			||||||
    output_dir: Path,
 | 
					        output_dir: Path,
 | 
				
			||||||
    *,
 | 
					        *,
 | 
				
			||||||
    file_type: str = "json",
 | 
					        file_type: str = "json",
 | 
				
			||||||
    n_sents: int = 1,
 | 
					        n_sents: int = 1,
 | 
				
			||||||
    seg_sents: bool = False,
 | 
					        seg_sents: bool = False,
 | 
				
			||||||
    model: Optional[str] = None,
 | 
					        model: Optional[str] = None,
 | 
				
			||||||
    morphology: bool = False,
 | 
					        morphology: bool = False,
 | 
				
			||||||
    merge_subtokens: bool = False,
 | 
					        merge_subtokens: bool = False,
 | 
				
			||||||
    converter: str = "auto",
 | 
					        converter: str = "auto",
 | 
				
			||||||
    ner_map_path: Optional[Path] = None,
 | 
					        ner_map: Optional[Path] = None,
 | 
				
			||||||
    lang: Optional[str] = None,
 | 
					        lang: Optional[str] = None,
 | 
				
			||||||
    silent: bool = True,
 | 
					        silent: bool = True,
 | 
				
			||||||
 | 
					        msg: Optional[Path] = None,
 | 
				
			||||||
) -> None:
 | 
					) -> None:
 | 
				
			||||||
    msg = Printer(no_print=silent, pretty=not silent)
 | 
					    if not msg:
 | 
				
			||||||
    input_path = Path(input_file)
 | 
					        msg = Printer(no_print=silent)
 | 
				
			||||||
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
 | 
					    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
 | 
				
			||||||
        # TODO: support msgpack via stdout in srsly?
 | 
					
 | 
				
			||||||
        msg.fail(
 | 
					    for input_loc in walk_directory(input_path):
 | 
				
			||||||
            f"Can't write .{file_type} data to stdout",
 | 
					        input_data = input_loc.open("r", encoding="utf-8").read()
 | 
				
			||||||
            "Please specify an output directory.",
 | 
					        # Use converter function to convert data
 | 
				
			||||||
            exits=1,
 | 
					        func = CONVERTERS[converter]
 | 
				
			||||||
 | 
					        docs = func(
 | 
				
			||||||
 | 
					            input_data,
 | 
				
			||||||
 | 
					            n_sents=n_sents,
 | 
				
			||||||
 | 
					            seg_sents=seg_sents,
 | 
				
			||||||
 | 
					            append_morphology=morphology,
 | 
				
			||||||
 | 
					            merge_subtokens=merge_subtokens,
 | 
				
			||||||
 | 
					            lang=lang,
 | 
				
			||||||
 | 
					            model=model,
 | 
				
			||||||
 | 
					            no_print=silent,
 | 
				
			||||||
 | 
					            ner_map=ner_map,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    if not input_path.exists():
 | 
					        if output_dir == "-":
 | 
				
			||||||
        msg.fail("Input file not found", input_path, exits=1)
 | 
					            _print_docs_to_stdout(docs, file_type)
 | 
				
			||||||
    if output_dir != "-" and not Path(output_dir).exists():
 | 
					 | 
				
			||||||
        msg.fail("Output directory not found", output_dir, exits=1)
 | 
					 | 
				
			||||||
    input_data = input_path.open("r", encoding="utf-8").read()
 | 
					 | 
				
			||||||
    if converter == "auto":
 | 
					 | 
				
			||||||
        converter = input_path.suffix[1:]
 | 
					 | 
				
			||||||
    if converter == "ner" or converter == "iob":
 | 
					 | 
				
			||||||
        converter_autodetect = autodetect_ner_format(input_data)
 | 
					 | 
				
			||||||
        if converter_autodetect == "ner":
 | 
					 | 
				
			||||||
            msg.info("Auto-detected token-per-line NER format")
 | 
					 | 
				
			||||||
            converter = converter_autodetect
 | 
					 | 
				
			||||||
        elif converter_autodetect == "iob":
 | 
					 | 
				
			||||||
            msg.info("Auto-detected sentence-per-line NER format")
 | 
					 | 
				
			||||||
            converter = converter_autodetect
 | 
					 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            msg.warn(
 | 
					            if input_loc != input_path:
 | 
				
			||||||
                "Can't automatically detect NER format. Conversion may not "
 | 
					                subpath = input_loc.relative_to(input_path)
 | 
				
			||||||
                "succeed. See https://spacy.io/api/cli#convert"
 | 
					                output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
 | 
				
			||||||
            )
 | 
					            else:
 | 
				
			||||||
    if converter not in CONVERTERS:
 | 
					                output_file = Path(output_dir) / input_loc.parts[-1]
 | 
				
			||||||
        msg.fail(f"Can't find converter for {converter}", exits=1)
 | 
					                output_file = output_file.with_suffix(f".{file_type}")
 | 
				
			||||||
    ner_map = None
 | 
					            _write_docs_to_file(docs, output_file, file_type)
 | 
				
			||||||
    if ner_map_path is not None:
 | 
					            msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 | 
				
			||||||
        ner_map = srsly.read_json(ner_map_path)
 | 
					
 | 
				
			||||||
    # Use converter function to convert data
 | 
					
 | 
				
			||||||
    func = CONVERTERS[converter]
 | 
					def _print_docs_to_stdout(docs, output_type):
 | 
				
			||||||
    data = func(
 | 
					    if output_type == "json":
 | 
				
			||||||
        input_data,
 | 
					        srsly.write_json("-", docs_to_json(docs))
 | 
				
			||||||
        n_sents=n_sents,
 | 
					 | 
				
			||||||
        seg_sents=seg_sents,
 | 
					 | 
				
			||||||
        append_morphology=morphology,
 | 
					 | 
				
			||||||
        merge_subtokens=merge_subtokens,
 | 
					 | 
				
			||||||
        lang=lang,
 | 
					 | 
				
			||||||
        model=model,
 | 
					 | 
				
			||||||
        no_print=silent,
 | 
					 | 
				
			||||||
        ner_map=ner_map,
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    if output_dir != "-":
 | 
					 | 
				
			||||||
        # Export data to a file
 | 
					 | 
				
			||||||
        suffix = f".{file_type}"
 | 
					 | 
				
			||||||
        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
 | 
					 | 
				
			||||||
        if file_type == "json":
 | 
					 | 
				
			||||||
            srsly.write_json(output_file, data)
 | 
					 | 
				
			||||||
        elif file_type == "jsonl":
 | 
					 | 
				
			||||||
            srsly.write_jsonl(output_file, data)
 | 
					 | 
				
			||||||
        elif file_type == "msg":
 | 
					 | 
				
			||||||
            srsly.write_msgpack(output_file, data)
 | 
					 | 
				
			||||||
        msg.good(f"Generated output file ({len(data)} documents): {output_file}")
 | 
					 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        # Print to stdout
 | 
					        sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
 | 
				
			||||||
        if file_type == "json":
 | 
					
 | 
				
			||||||
            srsly.write_json("-", data)
 | 
					
 | 
				
			||||||
        elif file_type == "jsonl":
 | 
					def _write_docs_to_file(docs, output_file, output_type):
 | 
				
			||||||
            srsly.write_jsonl("-", data)
 | 
					    if not output_file.parent.exists():
 | 
				
			||||||
 | 
					        output_file.parent.mkdir(parents=True)
 | 
				
			||||||
 | 
					    if output_type == "json":
 | 
				
			||||||
 | 
					        srsly.write_json(output_file, docs_to_json(docs))
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        data = DocBin(docs=docs).to_bytes()
 | 
				
			||||||
 | 
					        with output_file.open("wb") as file_:
 | 
				
			||||||
 | 
					            file_.write(data)
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def autodetect_ner_format(input_data: str) -> str:
 | 
					def autodetect_ner_format(input_data: str) -> str:
 | 
				
			||||||
| 
						 | 
					@ -173,3 +168,86 @@ def autodetect_ner_format(input_data: str) -> str:
 | 
				
			||||||
    if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
 | 
					    if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
 | 
				
			||||||
        return "iob"
 | 
					        return "iob"
 | 
				
			||||||
    return None
 | 
					    return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def walk_directory(path):
 | 
				
			||||||
 | 
					    if not path.is_dir():
 | 
				
			||||||
 | 
					        return [path]
 | 
				
			||||||
 | 
					    paths = [path]
 | 
				
			||||||
 | 
					    locs = []
 | 
				
			||||||
 | 
					    seen = set()
 | 
				
			||||||
 | 
					    for path in paths:
 | 
				
			||||||
 | 
					        if str(path) in seen:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        seen.add(str(path))
 | 
				
			||||||
 | 
					        if path.parts[-1].startswith("."):
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        elif path.is_dir():
 | 
				
			||||||
 | 
					            paths.extend(path.iterdir())
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            locs.append(path)
 | 
				
			||||||
 | 
					    return locs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def verify_cli_args(
 | 
				
			||||||
 | 
					    msg,
 | 
				
			||||||
 | 
					    input_path,
 | 
				
			||||||
 | 
					    output_dir,
 | 
				
			||||||
 | 
					    file_type,
 | 
				
			||||||
 | 
					    n_sents,
 | 
				
			||||||
 | 
					    seg_sents,
 | 
				
			||||||
 | 
					    model,
 | 
				
			||||||
 | 
					    morphology,
 | 
				
			||||||
 | 
					    merge_subtokens,
 | 
				
			||||||
 | 
					    converter,
 | 
				
			||||||
 | 
					    ner_map,
 | 
				
			||||||
 | 
					    lang,
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    input_path = Path(input_path)
 | 
				
			||||||
 | 
					    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
 | 
				
			||||||
 | 
					        # TODO: support msgpack via stdout in srsly?
 | 
				
			||||||
 | 
					        msg.fail(
 | 
				
			||||||
 | 
					            f"Can't write .{file_type} data to stdout",
 | 
				
			||||||
 | 
					            "Please specify an output directory.",
 | 
				
			||||||
 | 
					            exits=1,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    if not input_path.exists():
 | 
				
			||||||
 | 
					        msg.fail("Input file not found", input_path, exits=1)
 | 
				
			||||||
 | 
					    if output_dir != "-" and not Path(output_dir).exists():
 | 
				
			||||||
 | 
					        msg.fail("Output directory not found", output_dir, exits=1)
 | 
				
			||||||
 | 
					    if input_path.is_dir():
 | 
				
			||||||
 | 
					        input_locs = walk_directory(input_path)
 | 
				
			||||||
 | 
					        if len(input_locs) == 0:
 | 
				
			||||||
 | 
					            msg.fail("No input files in directory", input_path, exits=1)
 | 
				
			||||||
 | 
					        file_types = list(set([loc.suffix[1:] for loc in input_locs]))
 | 
				
			||||||
 | 
					        if len(file_types) >= 2:
 | 
				
			||||||
 | 
					            file_types = ",".join(file_types)
 | 
				
			||||||
 | 
					            msg.fail("All input files must be same type", file_types, exits=1)
 | 
				
			||||||
 | 
					    converter = _get_converter(msg, converter, input_path)
 | 
				
			||||||
 | 
					    if converter not in CONVERTERS:
 | 
				
			||||||
 | 
					        msg.fail(f"Can't find converter for {converter}", exits=1)
 | 
				
			||||||
 | 
					    return converter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _get_converter(msg, converter, input_path):
 | 
				
			||||||
 | 
					    if input_path.is_dir():
 | 
				
			||||||
 | 
					        input_path = walk_directory(input_path)[0]
 | 
				
			||||||
 | 
					    if converter == "auto":
 | 
				
			||||||
 | 
					        converter = input_path.suffix[1:]
 | 
				
			||||||
 | 
					    if converter == "ner" or converter == "iob":
 | 
				
			||||||
 | 
					        with input_path.open() as file_:
 | 
				
			||||||
 | 
					            input_data = file_.read()
 | 
				
			||||||
 | 
					        converter_autodetect = autodetect_ner_format(input_data)
 | 
				
			||||||
 | 
					        if converter_autodetect == "ner":
 | 
				
			||||||
 | 
					            msg.info("Auto-detected token-per-line NER format")
 | 
				
			||||||
 | 
					            converter = converter_autodetect
 | 
				
			||||||
 | 
					        elif converter_autodetect == "iob":
 | 
				
			||||||
 | 
					            msg.info("Auto-detected sentence-per-line NER format")
 | 
				
			||||||
 | 
					            converter = converter_autodetect
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            msg.warn(
 | 
				
			||||||
 | 
					                "Can't automatically detect NER format. "
 | 
				
			||||||
 | 
					                "Conversion may not succeed. "
 | 
				
			||||||
 | 
					                "See https://spacy.io/api/cli#convert"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					    return converter
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +0,0 @@
 | 
				
			||||||
from .conllu2json import conllu2json  # noqa: F401
 | 
					 | 
				
			||||||
from .iob2json import iob2json  # noqa: F401
 | 
					 | 
				
			||||||
from .conll_ner2json import conll_ner2json  # noqa: F401
 | 
					 | 
				
			||||||
from .jsonl2json import ner_jsonl2json  # noqa: F401
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,65 +0,0 @@
 | 
				
			||||||
from wasabi import Printer
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ...gold import iob_to_biluo
 | 
					 | 
				
			||||||
from ...util import minibatch
 | 
					 | 
				
			||||||
from .conll_ner2json import n_sents_info
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    Convert IOB files with one sentence per line and tags separated with '|'
 | 
					 | 
				
			||||||
    into JSON format for use with train cli. IOB and IOB2 are accepted.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Sample formats:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
 | 
					 | 
				
			||||||
    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
 | 
					 | 
				
			||||||
    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
 | 
					 | 
				
			||||||
    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    msg = Printer(no_print=no_print)
 | 
					 | 
				
			||||||
    docs = read_iob(input_data.split("\n"))
 | 
					 | 
				
			||||||
    if n_sents > 0:
 | 
					 | 
				
			||||||
        n_sents_info(msg, n_sents)
 | 
					 | 
				
			||||||
        docs = merge_sentences(docs, n_sents)
 | 
					 | 
				
			||||||
    return docs
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def read_iob(raw_sents):
 | 
					 | 
				
			||||||
    sentences = []
 | 
					 | 
				
			||||||
    for line in raw_sents:
 | 
					 | 
				
			||||||
        if not line.strip():
 | 
					 | 
				
			||||||
            continue
 | 
					 | 
				
			||||||
        tokens = [t.split("|") for t in line.split()]
 | 
					 | 
				
			||||||
        if len(tokens[0]) == 3:
 | 
					 | 
				
			||||||
            words, pos, iob = zip(*tokens)
 | 
					 | 
				
			||||||
        elif len(tokens[0]) == 2:
 | 
					 | 
				
			||||||
            words, iob = zip(*tokens)
 | 
					 | 
				
			||||||
            pos = ["-"] * len(words)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            raise ValueError(
 | 
					 | 
				
			||||||
                "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        biluo = iob_to_biluo(iob)
 | 
					 | 
				
			||||||
        sentences.append(
 | 
					 | 
				
			||||||
            [
 | 
					 | 
				
			||||||
                {"orth": w, "tag": p, "ner": ent}
 | 
					 | 
				
			||||||
                for (w, p, ent) in zip(words, pos, biluo)
 | 
					 | 
				
			||||||
            ]
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    sentences = [{"tokens": sent} for sent in sentences]
 | 
					 | 
				
			||||||
    paragraphs = [{"sentences": [sent]} for sent in sentences]
 | 
					 | 
				
			||||||
    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
 | 
					 | 
				
			||||||
    return docs
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def merge_sentences(docs, n_sents):
 | 
					 | 
				
			||||||
    merged = []
 | 
					 | 
				
			||||||
    for group in minibatch(docs, size=n_sents):
 | 
					 | 
				
			||||||
        group = list(group)
 | 
					 | 
				
			||||||
        first = group.pop(0)
 | 
					 | 
				
			||||||
        to_extend = first["paragraphs"][0]["sentences"]
 | 
					 | 
				
			||||||
        for sent in group:
 | 
					 | 
				
			||||||
            to_extend.extend(sent["paragraphs"][0]["sentences"])
 | 
					 | 
				
			||||||
        merged.append(first)
 | 
					 | 
				
			||||||
    return merged
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,50 +0,0 @@
 | 
				
			||||||
import srsly
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ...gold import docs_to_json
 | 
					 | 
				
			||||||
from ...util import get_lang_class, minibatch
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
 | 
					 | 
				
			||||||
    if lang is None:
 | 
					 | 
				
			||||||
        raise ValueError("No --lang specified, but tokenization required")
 | 
					 | 
				
			||||||
    json_docs = []
 | 
					 | 
				
			||||||
    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
 | 
					 | 
				
			||||||
    nlp = get_lang_class(lang)()
 | 
					 | 
				
			||||||
    sentencizer = nlp.create_pipe("sentencizer")
 | 
					 | 
				
			||||||
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
 | 
					 | 
				
			||||||
        docs = []
 | 
					 | 
				
			||||||
        for record in batch:
 | 
					 | 
				
			||||||
            raw_text = record["text"]
 | 
					 | 
				
			||||||
            if "entities" in record:
 | 
					 | 
				
			||||||
                ents = record["entities"]
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                ents = record["spans"]
 | 
					 | 
				
			||||||
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
 | 
					 | 
				
			||||||
            doc = nlp.make_doc(raw_text)
 | 
					 | 
				
			||||||
            sentencizer(doc)
 | 
					 | 
				
			||||||
            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
 | 
					 | 
				
			||||||
            doc.ents = _cleanup_spans(spans)
 | 
					 | 
				
			||||||
            docs.append(doc)
 | 
					 | 
				
			||||||
        json_docs.append(docs_to_json(docs, id=i))
 | 
					 | 
				
			||||||
    return json_docs
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def _cleanup_spans(spans):
 | 
					 | 
				
			||||||
    output = []
 | 
					 | 
				
			||||||
    seen = set()
 | 
					 | 
				
			||||||
    for span in spans:
 | 
					 | 
				
			||||||
        if span is not None:
 | 
					 | 
				
			||||||
            # Trim whitespace
 | 
					 | 
				
			||||||
            while len(span) and span[0].is_space:
 | 
					 | 
				
			||||||
                span = span[1:]
 | 
					 | 
				
			||||||
            while len(span) and span[-1].is_space:
 | 
					 | 
				
			||||||
                span = span[:-1]
 | 
					 | 
				
			||||||
            if not len(span):
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
            for i in range(span.start, span.end):
 | 
					 | 
				
			||||||
                if i in seen:
 | 
					 | 
				
			||||||
                    break
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                output.append(span)
 | 
					 | 
				
			||||||
                seen.update(range(span.start, span.end))
 | 
					 | 
				
			||||||
    return output
 | 
					 | 
				
			||||||
| 
						 | 
					@ -6,7 +6,7 @@ import srsly
 | 
				
			||||||
from wasabi import Printer, MESSAGES
 | 
					from wasabi import Printer, MESSAGES
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._app import app, Arg, Opt
 | 
					from ._app import app, Arg, Opt
 | 
				
			||||||
from ..gold import GoldCorpus, Example
 | 
					from ..gold import Corpus, Example
 | 
				
			||||||
from ..syntax import nonproj
 | 
					from ..syntax import nonproj
 | 
				
			||||||
from ..language import Language
 | 
					from ..language import Language
 | 
				
			||||||
from ..util import load_model, get_lang_class
 | 
					from ..util import load_model, get_lang_class
 | 
				
			||||||
| 
						 | 
					@ -99,7 +99,7 @@ def debug_data(
 | 
				
			||||||
    loading_train_error_message = ""
 | 
					    loading_train_error_message = ""
 | 
				
			||||||
    loading_dev_error_message = ""
 | 
					    loading_dev_error_message = ""
 | 
				
			||||||
    with msg.loading("Loading corpus..."):
 | 
					    with msg.loading("Loading corpus..."):
 | 
				
			||||||
        corpus = GoldCorpus(train_path, dev_path)
 | 
					        corpus = Corpus(train_path, dev_path)
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            train_dataset = list(corpus.train_dataset(nlp))
 | 
					            train_dataset = list(corpus.train_dataset(nlp))
 | 
				
			||||||
            train_dataset_unpreprocessed = list(
 | 
					            train_dataset_unpreprocessed = list(
 | 
				
			||||||
| 
						 | 
					@ -518,12 +518,12 @@ def _compile_gold(
 | 
				
			||||||
        "texts": set(),
 | 
					        "texts": set(),
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    for example in examples:
 | 
					    for example in examples:
 | 
				
			||||||
        gold = example.gold
 | 
					        gold = example.reference
 | 
				
			||||||
        doc = example.doc
 | 
					        doc = example.predicted
 | 
				
			||||||
        valid_words = [x for x in gold.words if x is not None]
 | 
					        valid_words = [x for x in gold if x is not None]
 | 
				
			||||||
        data["words"].update(valid_words)
 | 
					        data["words"].update(valid_words)
 | 
				
			||||||
        data["n_words"] += len(valid_words)
 | 
					        data["n_words"] += len(valid_words)
 | 
				
			||||||
        data["n_misaligned_words"] += len(gold.words) - len(valid_words)
 | 
					        data["n_misaligned_words"] += len(gold) - len(valid_words)
 | 
				
			||||||
        data["texts"].add(doc.text)
 | 
					        data["texts"].add(doc.text)
 | 
				
			||||||
        if len(nlp.vocab.vectors):
 | 
					        if len(nlp.vocab.vectors):
 | 
				
			||||||
            for word in valid_words:
 | 
					            for word in valid_words:
 | 
				
			||||||
| 
						 | 
					@ -578,10 +578,10 @@ def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
 | 
					def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
 | 
				
			||||||
    count = 0
 | 
					    count = 0
 | 
				
			||||||
    for ex in data:
 | 
					    for eg in data:
 | 
				
			||||||
        labels = [
 | 
					        labels = [
 | 
				
			||||||
            label.split("-")[1]
 | 
					            label.split("-")[1]
 | 
				
			||||||
            for label in ex.gold.ner
 | 
					            for label in eg.gold.ner
 | 
				
			||||||
            if label not in ("O", "-", None)
 | 
					            if label not in ("O", "-", None)
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
        if label not in labels:
 | 
					        if label not in labels:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,10 +3,10 @@ from timeit import default_timer as timer
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import Printer
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._app import app, Arg, Opt
 | 
					from ..gold import Corpus
 | 
				
			||||||
from ..tokens import Doc
 | 
					from ..tokens import Doc
 | 
				
			||||||
 | 
					from ._app import app, Arg, Opt
 | 
				
			||||||
from ..scorer import Scorer
 | 
					from ..scorer import Scorer
 | 
				
			||||||
from ..gold import GoldCorpus
 | 
					 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from .. import displacy
 | 
					from .. import displacy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -20,7 +20,9 @@ def evaluate_cli(
 | 
				
			||||||
    gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
 | 
					    gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
 | 
				
			||||||
    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
 | 
					    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
 | 
				
			||||||
    displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
 | 
					    displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
 | 
				
			||||||
    # fmt: on
 | 
					    return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Evaluate a model. To render a sample of parses in a HTML file, set an
 | 
					    Evaluate a model. To render a sample of parses in a HTML file, set an
 | 
				
			||||||
| 
						 | 
					@ -34,6 +36,7 @@ def evaluate_cli(
 | 
				
			||||||
        displacy_path=displacy_path,
 | 
					        displacy_path=displacy_path,
 | 
				
			||||||
        displacy_limit=displacy_limit,
 | 
					        displacy_limit=displacy_limit,
 | 
				
			||||||
        silent=False,
 | 
					        silent=False,
 | 
				
			||||||
 | 
					        return_scores=return_scores,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -45,6 +48,7 @@ def evaluate(
 | 
				
			||||||
    displacy_path: Optional[Path] = None,
 | 
					    displacy_path: Optional[Path] = None,
 | 
				
			||||||
    displacy_limit: int = 25,
 | 
					    displacy_limit: int = 25,
 | 
				
			||||||
    silent: bool = True,
 | 
					    silent: bool = True,
 | 
				
			||||||
 | 
					    return_scores: bool = False,
 | 
				
			||||||
) -> Scorer:
 | 
					) -> Scorer:
 | 
				
			||||||
    msg = Printer(no_print=silent, pretty=not silent)
 | 
					    msg = Printer(no_print=silent, pretty=not silent)
 | 
				
			||||||
    util.fix_random_seed()
 | 
					    util.fix_random_seed()
 | 
				
			||||||
| 
						 | 
					@ -57,7 +61,7 @@ def evaluate(
 | 
				
			||||||
        msg.fail("Evaluation data not found", data_path, exits=1)
 | 
					        msg.fail("Evaluation data not found", data_path, exits=1)
 | 
				
			||||||
    if displacy_path and not displacy_path.exists():
 | 
					    if displacy_path and not displacy_path.exists():
 | 
				
			||||||
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
 | 
					        msg.fail("Visualization output directory not found", displacy_path, exits=1)
 | 
				
			||||||
    corpus = GoldCorpus(data_path, data_path)
 | 
					    corpus = Corpus(data_path, data_path)
 | 
				
			||||||
    if model.startswith("blank:"):
 | 
					    if model.startswith("blank:"):
 | 
				
			||||||
        nlp = util.get_lang_class(model.replace("blank:", ""))()
 | 
					        nlp = util.get_lang_class(model.replace("blank:", ""))()
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
| 
						 | 
					@ -101,7 +105,8 @@ def evaluate(
 | 
				
			||||||
            ents=render_ents,
 | 
					            ents=render_ents,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
 | 
					        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
 | 
				
			||||||
    return scorer.scores
 | 
					    if return_scores:
 | 
				
			||||||
 | 
					        return scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def render_parses(
 | 
					def render_parses(
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -179,8 +179,7 @@ def pretrain(
 | 
				
			||||||
    skip_counter = 0
 | 
					    skip_counter = 0
 | 
				
			||||||
    loss_func = pretrain_config["loss_func"]
 | 
					    loss_func = pretrain_config["loss_func"]
 | 
				
			||||||
    for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
 | 
					    for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
 | 
				
			||||||
        examples = [Example(doc=text) for text in texts]
 | 
					        batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
 | 
				
			||||||
        batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"])
 | 
					 | 
				
			||||||
        for batch_id, batch in enumerate(batches):
 | 
					        for batch_id, batch in enumerate(batches):
 | 
				
			||||||
            docs, count = make_docs(
 | 
					            docs, count = make_docs(
 | 
				
			||||||
                nlp,
 | 
					                nlp,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,16 +1,18 @@
 | 
				
			||||||
from typing import Optional, Dict
 | 
					from typing import Optional, Dict, List, Union, Sequence
 | 
				
			||||||
from timeit import default_timer as timer
 | 
					from timeit import default_timer as timer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import tqdm
 | 
					import tqdm
 | 
				
			||||||
 | 
					from pydantic import BaseModel, FilePath
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
import thinc
 | 
					import thinc
 | 
				
			||||||
import thinc.schedules
 | 
					import thinc.schedules
 | 
				
			||||||
from thinc.api import use_pytorch_for_gpu_memory
 | 
					from thinc.api import Model, use_pytorch_for_gpu_memory
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._app import app, Arg, Opt
 | 
					from ._app import app, Arg, Opt
 | 
				
			||||||
from ..gold import GoldCorpus
 | 
					from ..gold import Corpus
 | 
				
			||||||
from ..lookups import Lookups
 | 
					from ..lookups import Lookups
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
| 
						 | 
					@ -82,6 +84,41 @@ subword_features = true
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class PipelineComponent(BaseModel):
 | 
				
			||||||
 | 
					    factory: str
 | 
				
			||||||
 | 
					    model: Model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    class Config:
 | 
				
			||||||
 | 
					        arbitrary_types_allowed = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class ConfigSchema(BaseModel):
 | 
				
			||||||
 | 
					    optimizer: Optional["Optimizer"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    class training(BaseModel):
 | 
				
			||||||
 | 
					        patience: int = 10
 | 
				
			||||||
 | 
					        eval_frequency: int = 100
 | 
				
			||||||
 | 
					        dropout: float = 0.2
 | 
				
			||||||
 | 
					        init_tok2vec: Optional[FilePath] = None
 | 
				
			||||||
 | 
					        max_epochs: int = 100
 | 
				
			||||||
 | 
					        orth_variant_level: float = 0.0
 | 
				
			||||||
 | 
					        gold_preproc: bool = False
 | 
				
			||||||
 | 
					        max_length: int = 0
 | 
				
			||||||
 | 
					        use_gpu: int = 0
 | 
				
			||||||
 | 
					        scores: List[str] = ["ents_p", "ents_r", "ents_f"]
 | 
				
			||||||
 | 
					        score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
 | 
				
			||||||
 | 
					        limit: int = 0
 | 
				
			||||||
 | 
					        batch_size: Union[Sequence[int], int]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    class nlp(BaseModel):
 | 
				
			||||||
 | 
					        lang: str
 | 
				
			||||||
 | 
					        vectors: Optional[str]
 | 
				
			||||||
 | 
					        pipeline: Optional[Dict[str, PipelineComponent]]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    class Config:
 | 
				
			||||||
 | 
					        extra = "allow"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@app.command("train")
 | 
					@app.command("train")
 | 
				
			||||||
def train_cli(
 | 
					def train_cli(
 | 
				
			||||||
    # fmt: off
 | 
					    # fmt: off
 | 
				
			||||||
| 
						 | 
					@ -104,33 +141,8 @@ def train_cli(
 | 
				
			||||||
    command.
 | 
					    command.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    util.set_env_log(verbose)
 | 
					    util.set_env_log(verbose)
 | 
				
			||||||
 | 
					    verify_cli_args(**locals())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Make sure all files and paths exists if they are needed
 | 
					 | 
				
			||||||
    if not config_path or not config_path.exists():
 | 
					 | 
				
			||||||
        msg.fail("Config file not found", config_path, exits=1)
 | 
					 | 
				
			||||||
    if not train_path or not train_path.exists():
 | 
					 | 
				
			||||||
        msg.fail("Training data not found", train_path, exits=1)
 | 
					 | 
				
			||||||
    if not dev_path or not dev_path.exists():
 | 
					 | 
				
			||||||
        msg.fail("Development data not found", dev_path, exits=1)
 | 
					 | 
				
			||||||
    if output_path is not None:
 | 
					 | 
				
			||||||
        if not output_path.exists():
 | 
					 | 
				
			||||||
            output_path.mkdir()
 | 
					 | 
				
			||||||
            msg.good(f"Created output directory: {output_path}")
 | 
					 | 
				
			||||||
        elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
 | 
					 | 
				
			||||||
            msg.warn(
 | 
					 | 
				
			||||||
                "Output directory is not empty.",
 | 
					 | 
				
			||||||
                "This can lead to unintended side effects when saving the model. "
 | 
					 | 
				
			||||||
                "Please use an empty directory or a different path instead. If "
 | 
					 | 
				
			||||||
                "the specified output path doesn't exist, the directory will be "
 | 
					 | 
				
			||||||
                "created for you.",
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
    if code_path is not None:
 | 
					 | 
				
			||||||
        if not code_path.exists():
 | 
					 | 
				
			||||||
            msg.fail("Path to Python code not found", code_path, exits=1)
 | 
					 | 
				
			||||||
        try:
 | 
					 | 
				
			||||||
            util.import_file("python_code", code_path)
 | 
					 | 
				
			||||||
        except Exception as e:
 | 
					 | 
				
			||||||
            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 | 
					 | 
				
			||||||
    if raw_text is not None:
 | 
					    if raw_text is not None:
 | 
				
			||||||
        raw_text = list(srsly.read_jsonl(raw_text))
 | 
					        raw_text = list(srsly.read_jsonl(raw_text))
 | 
				
			||||||
    tag_map = {}
 | 
					    tag_map = {}
 | 
				
			||||||
| 
						 | 
					@ -139,8 +151,6 @@ def train_cli(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    weights_data = None
 | 
					    weights_data = None
 | 
				
			||||||
    if init_tok2vec is not None:
 | 
					    if init_tok2vec is not None:
 | 
				
			||||||
        if not init_tok2vec.exists():
 | 
					 | 
				
			||||||
            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
 | 
					 | 
				
			||||||
        with init_tok2vec.open("rb") as file_:
 | 
					        with init_tok2vec.open("rb") as file_:
 | 
				
			||||||
            weights_data = file_.read()
 | 
					            weights_data = file_.read()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -184,71 +194,20 @@ def train(
 | 
				
			||||||
    nlp = util.load_model_from_config(nlp_config)
 | 
					    nlp = util.load_model_from_config(nlp_config)
 | 
				
			||||||
    optimizer = training["optimizer"]
 | 
					    optimizer = training["optimizer"]
 | 
				
			||||||
    limit = training["limit"]
 | 
					    limit = training["limit"]
 | 
				
			||||||
    msg.info("Loading training corpus")
 | 
					    corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
				
			||||||
    corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # verify textcat config
 | 
					 | 
				
			||||||
    if "textcat" in nlp_config["pipeline"]:
 | 
					    if "textcat" in nlp_config["pipeline"]:
 | 
				
			||||||
        textcat_labels = set(nlp.get_pipe("textcat").labels)
 | 
					        verify_textcat_config(nlp, nlp_config)
 | 
				
			||||||
        textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
 | 
					 | 
				
			||||||
            "exclusive_classes"
 | 
					 | 
				
			||||||
        ]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # check whether the setting 'exclusive_classes' corresponds to the provided training data
 | 
					 | 
				
			||||||
        if textcat_multilabel:
 | 
					 | 
				
			||||||
            multilabel_found = False
 | 
					 | 
				
			||||||
            for ex in corpus.train_examples:
 | 
					 | 
				
			||||||
                cats = ex.doc_annotation.cats
 | 
					 | 
				
			||||||
                textcat_labels.update(cats.keys())
 | 
					 | 
				
			||||||
                if list(cats.values()).count(1.0) != 1:
 | 
					 | 
				
			||||||
                    multilabel_found = True
 | 
					 | 
				
			||||||
            if not multilabel_found:
 | 
					 | 
				
			||||||
                msg.warn(
 | 
					 | 
				
			||||||
                    "The textcat training instances look like they have "
 | 
					 | 
				
			||||||
                    "mutually exclusive classes. Set 'exclusive_classes' "
 | 
					 | 
				
			||||||
                    "to 'true' in the config to train a classifier with "
 | 
					 | 
				
			||||||
                    "mutually exclusive classes more accurately."
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            for ex in corpus.train_examples:
 | 
					 | 
				
			||||||
                cats = ex.doc_annotation.cats
 | 
					 | 
				
			||||||
                textcat_labels.update(cats.keys())
 | 
					 | 
				
			||||||
                if list(cats.values()).count(1.0) != 1:
 | 
					 | 
				
			||||||
                    msg.fail(
 | 
					 | 
				
			||||||
                        "Some textcat training instances do not have exactly "
 | 
					 | 
				
			||||||
                        "one positive label. Set 'exclusive_classes' "
 | 
					 | 
				
			||||||
                        "to 'false' in the config to train a classifier with classes "
 | 
					 | 
				
			||||||
                        "that are not mutually exclusive."
 | 
					 | 
				
			||||||
                    )
 | 
					 | 
				
			||||||
        msg.info(
 | 
					 | 
				
			||||||
            f"Initialized textcat component for {len(textcat_labels)} unique labels"
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        nlp.get_pipe("textcat").labels = tuple(textcat_labels)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # if 'positive_label' is provided: double check whether it's in the data and the task is binary
 | 
					 | 
				
			||||||
        if nlp_config["pipeline"]["textcat"].get("positive_label", None):
 | 
					 | 
				
			||||||
            textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
					 | 
				
			||||||
            pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
 | 
					 | 
				
			||||||
            if pos_label not in textcat_labels:
 | 
					 | 
				
			||||||
                msg.fail(
 | 
					 | 
				
			||||||
                    f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
					 | 
				
			||||||
                    f"does not match any label in the training data.",
 | 
					 | 
				
			||||||
                    exits=1,
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            if len(textcat_labels) != 2:
 | 
					 | 
				
			||||||
                msg.fail(
 | 
					 | 
				
			||||||
                    f"A textcat 'positive_label' '{pos_label}' was "
 | 
					 | 
				
			||||||
                    f"provided for training data that does not appear to be a "
 | 
					 | 
				
			||||||
                    f"binary classification problem with two labels.",
 | 
					 | 
				
			||||||
                    exits=1,
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if training.get("resume", False):
 | 
					    if training.get("resume", False):
 | 
				
			||||||
        msg.info("Resuming training")
 | 
					        msg.info("Resuming training")
 | 
				
			||||||
        nlp.resume_training()
 | 
					        nlp.resume_training()
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
					        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
 | 
				
			||||||
        nlp.begin_training(lambda: corpus.train_examples)
 | 
					        train_examples = list(corpus.train_dataset(
 | 
				
			||||||
 | 
					            nlp,
 | 
				
			||||||
 | 
					            shuffle=False,
 | 
				
			||||||
 | 
					            gold_preproc=training["gold_preproc"]
 | 
				
			||||||
 | 
					        ))
 | 
				
			||||||
 | 
					        nlp.begin_training(lambda: train_examples)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Update tag map with provided mapping
 | 
					    # Update tag map with provided mapping
 | 
				
			||||||
    nlp.vocab.morphology.tag_map.update(tag_map)
 | 
					    nlp.vocab.morphology.tag_map.update(tag_map)
 | 
				
			||||||
| 
						 | 
					@ -279,6 +238,7 @@ def train(
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        tok2vec.from_bytes(weights_data)
 | 
					        tok2vec.from_bytes(weights_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    msg.info("Loading training corpus")
 | 
				
			||||||
    train_batches = create_train_batches(nlp, corpus, training)
 | 
					    train_batches = create_train_batches(nlp, corpus, training)
 | 
				
			||||||
    evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
 | 
					    evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -311,18 +271,15 @@ def train(
 | 
				
			||||||
                    update_meta(training, nlp, info)
 | 
					                    update_meta(training, nlp, info)
 | 
				
			||||||
                    nlp.to_disk(output_path / "model-best")
 | 
					                    nlp.to_disk(output_path / "model-best")
 | 
				
			||||||
                progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
 | 
					                progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
 | 
				
			||||||
            # Clean up the objects to faciliate garbage collection.
 | 
					 | 
				
			||||||
            for eg in batch:
 | 
					 | 
				
			||||||
                eg.doc = None
 | 
					 | 
				
			||||||
                eg.goldparse = None
 | 
					 | 
				
			||||||
                eg.doc_annotation = None
 | 
					 | 
				
			||||||
                eg.token_annotation = None
 | 
					 | 
				
			||||||
    except Exception as e:
 | 
					    except Exception as e:
 | 
				
			||||||
        msg.warn(
 | 
					        if output_path is not None:
 | 
				
			||||||
            f"Aborting and saving the final best model. "
 | 
					            msg.warn(
 | 
				
			||||||
            f"Encountered exception: {str(e)}",
 | 
					                f"Aborting and saving the final best model. "
 | 
				
			||||||
            exits=1,
 | 
					                f"Encountered exception: {str(e)}",
 | 
				
			||||||
        )
 | 
					                exits=1,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            raise e
 | 
				
			||||||
    finally:
 | 
					    finally:
 | 
				
			||||||
        if output_path is not None:
 | 
					        if output_path is not None:
 | 
				
			||||||
            final_model_path = output_path / "model-final"
 | 
					            final_model_path = output_path / "model-final"
 | 
				
			||||||
| 
						 | 
					@ -335,21 +292,19 @@ def train(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_train_batches(nlp, corpus, cfg):
 | 
					def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
    epochs_todo = cfg.get("max_epochs", 0)
 | 
					    max_epochs = cfg.get("max_epochs", 0)
 | 
				
			||||||
 | 
					    train_examples = list(corpus.train_dataset(
 | 
				
			||||||
 | 
					        nlp,
 | 
				
			||||||
 | 
					        shuffle=True,
 | 
				
			||||||
 | 
					        gold_preproc=cfg["gold_preproc"],
 | 
				
			||||||
 | 
					        max_length=cfg["max_length"]
 | 
				
			||||||
 | 
					    ))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    epoch = 0
 | 
				
			||||||
    while True:
 | 
					    while True:
 | 
				
			||||||
        train_examples = list(
 | 
					 | 
				
			||||||
            corpus.train_dataset(
 | 
					 | 
				
			||||||
                nlp,
 | 
					 | 
				
			||||||
                noise_level=0.0,  # I think this is deprecated?
 | 
					 | 
				
			||||||
                orth_variant_level=cfg["orth_variant_level"],
 | 
					 | 
				
			||||||
                gold_preproc=cfg["gold_preproc"],
 | 
					 | 
				
			||||||
                max_length=cfg["max_length"],
 | 
					 | 
				
			||||||
                ignore_misaligned=True,
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        if len(train_examples) == 0:
 | 
					        if len(train_examples) == 0:
 | 
				
			||||||
            raise ValueError(Errors.E988)
 | 
					            raise ValueError(Errors.E988)
 | 
				
			||||||
        random.shuffle(train_examples)
 | 
					        epoch += 1
 | 
				
			||||||
        batches = util.minibatch_by_words(
 | 
					        batches = util.minibatch_by_words(
 | 
				
			||||||
            train_examples,
 | 
					            train_examples,
 | 
				
			||||||
            size=cfg["batch_size"],
 | 
					            size=cfg["batch_size"],
 | 
				
			||||||
| 
						 | 
					@ -358,15 +313,12 @@ def create_train_batches(nlp, corpus, cfg):
 | 
				
			||||||
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
					        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            first = next(batches)
 | 
					            first = next(batches)
 | 
				
			||||||
            yield first
 | 
					            yield epoch, first
 | 
				
			||||||
        except StopIteration:
 | 
					        except StopIteration:
 | 
				
			||||||
            raise ValueError(Errors.E986)
 | 
					            raise ValueError(Errors.E986)
 | 
				
			||||||
        for batch in batches:
 | 
					        for batch in batches:
 | 
				
			||||||
            yield batch
 | 
					            yield epoch, batch
 | 
				
			||||||
        epochs_todo -= 1
 | 
					        if max_epochs >= 1 and epoch >= max_epochs:
 | 
				
			||||||
        # We intentionally compare exactly to 0 here, so that max_epochs < 1
 | 
					 | 
				
			||||||
        # will not break.
 | 
					 | 
				
			||||||
        if epochs_todo == 0:
 | 
					 | 
				
			||||||
            break
 | 
					            break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -377,7 +329,8 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
				
			||||||
                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
					                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        n_words = sum(len(ex.doc) for ex in dev_examples)
 | 
					
 | 
				
			||||||
 | 
					        n_words = sum(len(ex.predicted) for ex in dev_examples)
 | 
				
			||||||
        start_time = timer()
 | 
					        start_time = timer()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if optimizer.averages:
 | 
					        if optimizer.averages:
 | 
				
			||||||
| 
						 | 
					@ -395,7 +348,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
 | 
				
			||||||
        except KeyError as e:
 | 
					        except KeyError as e:
 | 
				
			||||||
            raise KeyError(
 | 
					            raise KeyError(
 | 
				
			||||||
                Errors.E983.format(
 | 
					                Errors.E983.format(
 | 
				
			||||||
                    dict_name="score_weights", key=str(e), keys=list(scores.keys())
 | 
					                    dict="score_weights", key=str(e), keys=list(scores.keys())
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -438,7 +391,7 @@ def train_while_improving(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Every iteration, the function yields out a tuple with:
 | 
					    Every iteration, the function yields out a tuple with:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs.
 | 
					    * batch: A list of Example objects.
 | 
				
			||||||
    * info: A dict with various information about the last update (see below).
 | 
					    * info: A dict with various information about the last update (see below).
 | 
				
			||||||
    * is_best_checkpoint: A value in None, False, True, indicating whether this
 | 
					    * is_best_checkpoint: A value in None, False, True, indicating whether this
 | 
				
			||||||
        was the best evaluation so far. You should use this to save the model
 | 
					        was the best evaluation so far. You should use this to save the model
 | 
				
			||||||
| 
						 | 
					@ -470,7 +423,7 @@ def train_while_improving(
 | 
				
			||||||
            (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
 | 
					            (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for step, batch in enumerate(train_data):
 | 
					    for step, (epoch, batch) in enumerate(train_data):
 | 
				
			||||||
        dropout = next(dropouts)
 | 
					        dropout = next(dropouts)
 | 
				
			||||||
        with nlp.select_pipes(enable=to_enable):
 | 
					        with nlp.select_pipes(enable=to_enable):
 | 
				
			||||||
            for subbatch in subdivide_batch(batch, accumulate_gradient):
 | 
					            for subbatch in subdivide_batch(batch, accumulate_gradient):
 | 
				
			||||||
| 
						 | 
					@ -492,6 +445,7 @@ def train_while_improving(
 | 
				
			||||||
            score, other_scores = (None, None)
 | 
					            score, other_scores = (None, None)
 | 
				
			||||||
            is_best_checkpoint = None
 | 
					            is_best_checkpoint = None
 | 
				
			||||||
        info = {
 | 
					        info = {
 | 
				
			||||||
 | 
					            "epoch": epoch,
 | 
				
			||||||
            "step": step,
 | 
					            "step": step,
 | 
				
			||||||
            "score": score,
 | 
					            "score": score,
 | 
				
			||||||
            "other_scores": other_scores,
 | 
					            "other_scores": other_scores,
 | 
				
			||||||
| 
						 | 
					@ -512,7 +466,7 @@ def train_while_improving(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def subdivide_batch(batch, accumulate_gradient):
 | 
					def subdivide_batch(batch, accumulate_gradient):
 | 
				
			||||||
    batch = list(batch)
 | 
					    batch = list(batch)
 | 
				
			||||||
    batch.sort(key=lambda eg: len(eg.doc))
 | 
					    batch.sort(key=lambda eg: len(eg.predicted))
 | 
				
			||||||
    sub_len = len(batch) // accumulate_gradient
 | 
					    sub_len = len(batch) // accumulate_gradient
 | 
				
			||||||
    start = 0
 | 
					    start = 0
 | 
				
			||||||
    for i in range(accumulate_gradient):
 | 
					    for i in range(accumulate_gradient):
 | 
				
			||||||
| 
						 | 
					@ -530,9 +484,9 @@ def setup_printer(training, nlp):
 | 
				
			||||||
    score_widths = [max(len(col), 6) for col in score_cols]
 | 
					    score_widths = [max(len(col), 6) for col in score_cols]
 | 
				
			||||||
    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
					    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
 | 
				
			||||||
    loss_widths = [max(len(col), 8) for col in loss_cols]
 | 
					    loss_widths = [max(len(col), 8) for col in loss_cols]
 | 
				
			||||||
    table_header = ["#"] + loss_cols + score_cols + ["Score"]
 | 
					    table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
 | 
				
			||||||
    table_header = [col.upper() for col in table_header]
 | 
					    table_header = [col.upper() for col in table_header]
 | 
				
			||||||
    table_widths = [6] + loss_widths + score_widths + [6]
 | 
					    table_widths = [3, 6] + loss_widths + score_widths + [6]
 | 
				
			||||||
    table_aligns = ["r" for _ in table_widths]
 | 
					    table_aligns = ["r" for _ in table_widths]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    msg.row(table_header, widths=table_widths)
 | 
					    msg.row(table_header, widths=table_widths)
 | 
				
			||||||
| 
						 | 
					@ -547,9 +501,7 @@ def setup_printer(training, nlp):
 | 
				
			||||||
        except KeyError as e:
 | 
					        except KeyError as e:
 | 
				
			||||||
            raise KeyError(
 | 
					            raise KeyError(
 | 
				
			||||||
                Errors.E983.format(
 | 
					                Errors.E983.format(
 | 
				
			||||||
                    dict_name="scores (losses)",
 | 
					                    dict="scores (losses)", key=str(e), keys=list(info["losses"].keys())
 | 
				
			||||||
                    key=str(e),
 | 
					 | 
				
			||||||
                    keys=list(info["losses"].keys()),
 | 
					 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -560,13 +512,13 @@ def setup_printer(training, nlp):
 | 
				
			||||||
        except KeyError as e:
 | 
					        except KeyError as e:
 | 
				
			||||||
            raise KeyError(
 | 
					            raise KeyError(
 | 
				
			||||||
                Errors.E983.format(
 | 
					                Errors.E983.format(
 | 
				
			||||||
                    dict_name="scores (other)",
 | 
					                    dict="scores (other)",
 | 
				
			||||||
                    key=str(e),
 | 
					                    key=str(e),
 | 
				
			||||||
                    keys=list(info["other_scores"].keys()),
 | 
					                    keys=list(info["other_scores"].keys()),
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        data = (
 | 
					        data = (
 | 
				
			||||||
            [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
 | 
					            [info["epoch"], info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        msg.row(data, widths=table_widths, aligns=table_aligns)
 | 
					        msg.row(data, widths=table_widths, aligns=table_aligns)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -580,3 +532,67 @@ def update_meta(training, nlp, info):
 | 
				
			||||||
        nlp.meta["performance"][metric] = info["other_scores"][metric]
 | 
					        nlp.meta["performance"][metric] = info["other_scores"][metric]
 | 
				
			||||||
    for pipe_name in nlp.pipe_names:
 | 
					    for pipe_name in nlp.pipe_names:
 | 
				
			||||||
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
					        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def verify_cli_args(
 | 
				
			||||||
 | 
					    train_path,
 | 
				
			||||||
 | 
					    dev_path,
 | 
				
			||||||
 | 
					    config_path,
 | 
				
			||||||
 | 
					    output_path=None,
 | 
				
			||||||
 | 
					    code_path=None,
 | 
				
			||||||
 | 
					    init_tok2vec=None,
 | 
				
			||||||
 | 
					    raw_text=None,
 | 
				
			||||||
 | 
					    verbose=False,
 | 
				
			||||||
 | 
					    use_gpu=-1,
 | 
				
			||||||
 | 
					    tag_map_path=None,
 | 
				
			||||||
 | 
					    omit_extra_lookups=False,
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    # Make sure all files and paths exists if they are needed
 | 
				
			||||||
 | 
					    if not config_path or not config_path.exists():
 | 
				
			||||||
 | 
					        msg.fail("Config file not found", config_path, exits=1)
 | 
				
			||||||
 | 
					    if not train_path or not train_path.exists():
 | 
				
			||||||
 | 
					        msg.fail("Training data not found", train_path, exits=1)
 | 
				
			||||||
 | 
					    if not dev_path or not dev_path.exists():
 | 
				
			||||||
 | 
					        msg.fail("Development data not found", dev_path, exits=1)
 | 
				
			||||||
 | 
					    if output_path is not None:
 | 
				
			||||||
 | 
					        if not output_path.exists():
 | 
				
			||||||
 | 
					            output_path.mkdir()
 | 
				
			||||||
 | 
					            msg.good(f"Created output directory: {output_path}")
 | 
				
			||||||
 | 
					        elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
 | 
				
			||||||
 | 
					            msg.warn(
 | 
				
			||||||
 | 
					                "Output directory is not empty.",
 | 
				
			||||||
 | 
					                "This can lead to unintended side effects when saving the model. "
 | 
				
			||||||
 | 
					                "Please use an empty directory or a different path instead. If "
 | 
				
			||||||
 | 
					                "the specified output path doesn't exist, the directory will be "
 | 
				
			||||||
 | 
					                "created for you.",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					    if code_path is not None:
 | 
				
			||||||
 | 
					        if not code_path.exists():
 | 
				
			||||||
 | 
					            msg.fail("Path to Python code not found", code_path, exits=1)
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            util.import_file("python_code", code_path)
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 | 
				
			||||||
 | 
					    if init_tok2vec is not None and not init_tok2vec.exists():
 | 
				
			||||||
 | 
					        msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def verify_textcat_config(nlp, nlp_config):
 | 
				
			||||||
 | 
					    # if 'positive_label' is provided: double check whether it's in the data and
 | 
				
			||||||
 | 
					    # the task is binary
 | 
				
			||||||
 | 
					    if nlp_config["pipeline"]["textcat"].get("positive_label", None):
 | 
				
			||||||
 | 
					        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
 | 
				
			||||||
 | 
					        pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
 | 
				
			||||||
 | 
					        if pos_label not in textcat_labels:
 | 
				
			||||||
 | 
					            msg.fail(
 | 
				
			||||||
 | 
					                f"The textcat's 'positive_label' config setting '{pos_label}' "
 | 
				
			||||||
 | 
					                f"does not match any label in the training data.",
 | 
				
			||||||
 | 
					                exits=1,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        if len(textcat_labels) != 2:
 | 
				
			||||||
 | 
					            msg.fail(
 | 
				
			||||||
 | 
					                f"A textcat 'positive_label' '{pos_label}' was "
 | 
				
			||||||
 | 
					                f"provided for training data that does not appear to be a "
 | 
				
			||||||
 | 
					                f"binary classification problem with two labels.",
 | 
				
			||||||
 | 
					                exits=1,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
| 
						 | 
					@ -132,6 +132,8 @@ class Warnings(object):
 | 
				
			||||||
            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
 | 
					            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # TODO: fix numbering after merging develop into master
 | 
					    # TODO: fix numbering after merging develop into master
 | 
				
			||||||
 | 
					    W093 = ("Could not find any data to train the {name} on. Is your "
 | 
				
			||||||
 | 
					            "input data correctly formatted ?")
 | 
				
			||||||
    W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
 | 
					    W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
 | 
				
			||||||
            "spaCy version requirement: {version}. This can lead to compatibility "
 | 
					            "spaCy version requirement: {version}. This can lead to compatibility "
 | 
				
			||||||
            "problems with older versions, or as new spaCy versions are "
 | 
					            "problems with older versions, or as new spaCy versions are "
 | 
				
			||||||
| 
						 | 
					@ -575,9 +577,6 @@ class Errors(object):
 | 
				
			||||||
            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
 | 
					            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
 | 
				
			||||||
    E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
 | 
					    E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
 | 
				
			||||||
    E187 = ("Only unicode strings are supported as labels.")
 | 
					    E187 = ("Only unicode strings are supported as labels.")
 | 
				
			||||||
    E188 = ("Could not match the gold entity links to entities in the doc - "
 | 
					 | 
				
			||||||
            "make sure the gold EL data refers to valid results of the "
 | 
					 | 
				
			||||||
            "named entity recognizer in the `nlp` pipeline.")
 | 
					 | 
				
			||||||
    E189 = ("Each argument to `get_doc` should be of equal length.")
 | 
					    E189 = ("Each argument to `get_doc` should be of equal length.")
 | 
				
			||||||
    E190 = ("Token head out of range in `Doc.from_array()` for token index "
 | 
					    E190 = ("Token head out of range in `Doc.from_array()` for token index "
 | 
				
			||||||
            "'{index}' with value '{value}' (equivalent to relative head "
 | 
					            "'{index}' with value '{value}' (equivalent to relative head "
 | 
				
			||||||
| 
						 | 
					@ -602,10 +601,17 @@ class Errors(object):
 | 
				
			||||||
            "can not be combined with adding a pretrained Tok2Vec layer.")
 | 
					            "can not be combined with adding a pretrained Tok2Vec layer.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # TODO: fix numbering after merging develop into master
 | 
					    # TODO: fix numbering after merging develop into master
 | 
				
			||||||
    E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
 | 
					    E978 = ("The {method} method of component {name} takes a list of Example objects, "
 | 
				
			||||||
 | 
					            "but found {types} instead.")
 | 
				
			||||||
 | 
					    E979 = ("Cannot convert {type} to an Example object.")
 | 
				
			||||||
 | 
					    E980 = ("Each link annotation should refer to a dictionary with at most one "
 | 
				
			||||||
 | 
					            "identifier mapping to 1.0, and all others to 0.0.")
 | 
				
			||||||
 | 
					    E981 = ("The offsets of the annotations for 'links' need to refer exactly "
 | 
				
			||||||
 | 
					            "to the offsets of the 'entities' annotations.")
 | 
				
			||||||
 | 
					    E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
 | 
				
			||||||
 | 
					            "into {values}, but found {value}.")
 | 
				
			||||||
 | 
					    E983 = ("Invalid key for '{dict}': {key}. Available keys: "
 | 
				
			||||||
            "{keys}")
 | 
					            "{keys}")
 | 
				
			||||||
    E984 = ("Could not parse the {input} - double check the data is written "
 | 
					 | 
				
			||||||
            "in the correct format as expected by spaCy.")
 | 
					 | 
				
			||||||
    E985 = ("The pipeline component '{component}' is already available in the base "
 | 
					    E985 = ("The pipeline component '{component}' is already available in the base "
 | 
				
			||||||
            "model. The settings in the component block in the config file are "
 | 
					            "model. The settings in the component block in the config file are "
 | 
				
			||||||
            "being ignored. If you want to replace this component instead, set "
 | 
					            "being ignored. If you want to replace this component instead, set "
 | 
				
			||||||
| 
						 | 
					@ -637,10 +643,6 @@ class Errors(object):
 | 
				
			||||||
    E997 = ("Tokenizer special cases are not allowed to modify the text. "
 | 
					    E997 = ("Tokenizer special cases are not allowed to modify the text. "
 | 
				
			||||||
            "This would map '{chunk}' to '{orth}' given token attributes "
 | 
					            "This would map '{chunk}' to '{orth}' given token attributes "
 | 
				
			||||||
            "'{token_attrs}'.")
 | 
					            "'{token_attrs}'.")
 | 
				
			||||||
    E998 = ("To create GoldParse objects from Example objects without a "
 | 
					 | 
				
			||||||
            "Doc, get_gold_parses() should be called with a Vocab object.")
 | 
					 | 
				
			||||||
    E999 = ("Encountered an unexpected format for the dictionary holding "
 | 
					 | 
				
			||||||
            "gold annotations: {gold_dict}")
 | 
					 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@add_codes
 | 
					@add_codes
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,68 +0,0 @@
 | 
				
			||||||
from cymem.cymem cimport Pool
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from .typedefs cimport attr_t
 | 
					 | 
				
			||||||
from .syntax.transition_system cimport Transition
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from .tokens import Doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef struct GoldParseC:
 | 
					 | 
				
			||||||
    int* tags
 | 
					 | 
				
			||||||
    int* heads
 | 
					 | 
				
			||||||
    int* has_dep
 | 
					 | 
				
			||||||
    int* sent_start
 | 
					 | 
				
			||||||
    attr_t* labels
 | 
					 | 
				
			||||||
    int** brackets
 | 
					 | 
				
			||||||
    Transition* ner
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class GoldParse:
 | 
					 | 
				
			||||||
    cdef Pool mem
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef GoldParseC c
 | 
					 | 
				
			||||||
    cdef readonly TokenAnnotation orig
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef int length
 | 
					 | 
				
			||||||
    cdef public int loss
 | 
					 | 
				
			||||||
    cdef public list words
 | 
					 | 
				
			||||||
    cdef public list tags
 | 
					 | 
				
			||||||
    cdef public list pos
 | 
					 | 
				
			||||||
    cdef public list morphs
 | 
					 | 
				
			||||||
    cdef public list lemmas
 | 
					 | 
				
			||||||
    cdef public list sent_starts
 | 
					 | 
				
			||||||
    cdef public list heads
 | 
					 | 
				
			||||||
    cdef public list labels
 | 
					 | 
				
			||||||
    cdef public dict orths
 | 
					 | 
				
			||||||
    cdef public list ner
 | 
					 | 
				
			||||||
    cdef public dict brackets
 | 
					 | 
				
			||||||
    cdef public dict cats
 | 
					 | 
				
			||||||
    cdef public dict links
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef readonly list cand_to_gold
 | 
					 | 
				
			||||||
    cdef readonly list gold_to_cand
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class TokenAnnotation:
 | 
					 | 
				
			||||||
    cdef public list ids
 | 
					 | 
				
			||||||
    cdef public list words
 | 
					 | 
				
			||||||
    cdef public list tags
 | 
					 | 
				
			||||||
    cdef public list pos
 | 
					 | 
				
			||||||
    cdef public list morphs
 | 
					 | 
				
			||||||
    cdef public list lemmas
 | 
					 | 
				
			||||||
    cdef public list heads
 | 
					 | 
				
			||||||
    cdef public list deps
 | 
					 | 
				
			||||||
    cdef public list entities
 | 
					 | 
				
			||||||
    cdef public list sent_starts
 | 
					 | 
				
			||||||
    cdef public dict brackets_by_start
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class DocAnnotation:
 | 
					 | 
				
			||||||
    cdef public object cats
 | 
					 | 
				
			||||||
    cdef public object links
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class Example:
 | 
					 | 
				
			||||||
    cdef public object doc
 | 
					 | 
				
			||||||
    cdef public TokenAnnotation token_annotation
 | 
					 | 
				
			||||||
    cdef public DocAnnotation doc_annotation
 | 
					 | 
				
			||||||
    cdef public object goldparse
 | 
					 | 
				
			||||||
							
								
								
									
										1420
									
								
								spacy/gold.pyx
									
									
									
									
									
								
							
							
						
						
									
										1420
									
								
								spacy/gold.pyx
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										0
									
								
								spacy/gold/__init__.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/gold/__init__.pxd
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										11
									
								
								spacy/gold/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								spacy/gold/__init__.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,11 @@
 | 
				
			||||||
 | 
					from .corpus import Corpus
 | 
				
			||||||
 | 
					from .example import Example
 | 
				
			||||||
 | 
					from .align import align
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .iob_utils import iob_to_biluo, biluo_to_iob
 | 
				
			||||||
 | 
					from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
 | 
				
			||||||
 | 
					from .iob_utils import spans_from_biluo_tags
 | 
				
			||||||
 | 
					from .iob_utils import tags_to_entities
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .gold_io import docs_to_json
 | 
				
			||||||
 | 
					from .gold_io import read_json_file
 | 
				
			||||||
							
								
								
									
										8
									
								
								spacy/gold/align.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								spacy/gold/align.pxd
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,8 @@
 | 
				
			||||||
 | 
					cdef class Alignment:
 | 
				
			||||||
 | 
					    cdef public object cost
 | 
				
			||||||
 | 
					    cdef public object i2j
 | 
				
			||||||
 | 
					    cdef public object j2i
 | 
				
			||||||
 | 
					    cdef public object i2j_multi
 | 
				
			||||||
 | 
					    cdef public object j2i_multi
 | 
				
			||||||
 | 
					    cdef public object cand_to_gold
 | 
				
			||||||
 | 
					    cdef public object gold_to_cand
 | 
				
			||||||
							
								
								
									
										101
									
								
								spacy/gold/align.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										101
									
								
								spacy/gold/align.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,101 @@
 | 
				
			||||||
 | 
					import numpy
 | 
				
			||||||
 | 
					from ..errors import Errors, AlignmentError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class Alignment:
 | 
				
			||||||
 | 
					    def __init__(self, spacy_words, gold_words):
 | 
				
			||||||
 | 
					        # Do many-to-one alignment for misaligned tokens.
 | 
				
			||||||
 | 
					        # If we over-segment, we'll have one gold word that covers a sequence
 | 
				
			||||||
 | 
					        # of predicted words
 | 
				
			||||||
 | 
					        # If we under-segment, we'll have one predicted word that covers a
 | 
				
			||||||
 | 
					        # sequence of gold words.
 | 
				
			||||||
 | 
					        # If we "mis-segment", we'll have a sequence of predicted words covering
 | 
				
			||||||
 | 
					        # a sequence of gold words. That's many-to-many -- we don't do that
 | 
				
			||||||
 | 
					        # except for NER spans where the start and end can be aligned.
 | 
				
			||||||
 | 
					        cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words)
 | 
				
			||||||
 | 
					        self.cost = cost
 | 
				
			||||||
 | 
					        self.i2j = i2j
 | 
				
			||||||
 | 
					        self.j2i = j2i
 | 
				
			||||||
 | 
					        self.i2j_multi = i2j_multi
 | 
				
			||||||
 | 
					        self.j2i_multi = j2i_multi
 | 
				
			||||||
 | 
					        self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
 | 
				
			||||||
 | 
					        self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def align(tokens_a, tokens_b):
 | 
				
			||||||
 | 
					    """Calculate alignment tables between two tokenizations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    tokens_a (List[str]): The candidate tokenization.
 | 
				
			||||||
 | 
					    tokens_b (List[str]): The reference tokenization.
 | 
				
			||||||
 | 
					    RETURNS: (tuple): A 5-tuple consisting of the following information:
 | 
				
			||||||
 | 
					      * cost (int): The number of misaligned tokens.
 | 
				
			||||||
 | 
					      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
 | 
				
			||||||
 | 
					        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
 | 
				
			||||||
 | 
					        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
 | 
				
			||||||
 | 
					        it has the value -1.
 | 
				
			||||||
 | 
					      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
 | 
				
			||||||
 | 
					      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
 | 
				
			||||||
 | 
					        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
 | 
				
			||||||
 | 
					        the same token of `tokens_b`.
 | 
				
			||||||
 | 
					      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
 | 
				
			||||||
 | 
					            direction.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    tokens_a = _normalize_for_alignment(tokens_a)
 | 
				
			||||||
 | 
					    tokens_b = _normalize_for_alignment(tokens_b)
 | 
				
			||||||
 | 
					    cost = 0
 | 
				
			||||||
 | 
					    a2b = numpy.empty(len(tokens_a), dtype="i")
 | 
				
			||||||
 | 
					    b2a = numpy.empty(len(tokens_b), dtype="i")
 | 
				
			||||||
 | 
					    a2b.fill(-1)
 | 
				
			||||||
 | 
					    b2a.fill(-1)
 | 
				
			||||||
 | 
					    a2b_multi = {}
 | 
				
			||||||
 | 
					    b2a_multi = {}
 | 
				
			||||||
 | 
					    i = 0
 | 
				
			||||||
 | 
					    j = 0
 | 
				
			||||||
 | 
					    offset_a = 0
 | 
				
			||||||
 | 
					    offset_b = 0
 | 
				
			||||||
 | 
					    while i < len(tokens_a) and j < len(tokens_b):
 | 
				
			||||||
 | 
					        a = tokens_a[i][offset_a:]
 | 
				
			||||||
 | 
					        b = tokens_b[j][offset_b:]
 | 
				
			||||||
 | 
					        if a == b:
 | 
				
			||||||
 | 
					            if offset_a == offset_b == 0:
 | 
				
			||||||
 | 
					                a2b[i] = j
 | 
				
			||||||
 | 
					                b2a[j] = i
 | 
				
			||||||
 | 
					            elif offset_a == 0:
 | 
				
			||||||
 | 
					                cost += 2
 | 
				
			||||||
 | 
					                a2b_multi[i] = j
 | 
				
			||||||
 | 
					            elif offset_b == 0:
 | 
				
			||||||
 | 
					                cost += 2
 | 
				
			||||||
 | 
					                b2a_multi[j] = i
 | 
				
			||||||
 | 
					            offset_a = offset_b = 0
 | 
				
			||||||
 | 
					            i += 1
 | 
				
			||||||
 | 
					            j += 1
 | 
				
			||||||
 | 
					        elif a == "":
 | 
				
			||||||
 | 
					            assert offset_a == 0
 | 
				
			||||||
 | 
					            cost += 1
 | 
				
			||||||
 | 
					            i += 1
 | 
				
			||||||
 | 
					        elif b == "":
 | 
				
			||||||
 | 
					            assert offset_b == 0
 | 
				
			||||||
 | 
					            cost += 1
 | 
				
			||||||
 | 
					            j += 1
 | 
				
			||||||
 | 
					        elif b.startswith(a):
 | 
				
			||||||
 | 
					            cost += 1
 | 
				
			||||||
 | 
					            if offset_a == 0:
 | 
				
			||||||
 | 
					                a2b_multi[i] = j
 | 
				
			||||||
 | 
					            i += 1
 | 
				
			||||||
 | 
					            offset_a = 0
 | 
				
			||||||
 | 
					            offset_b += len(a)
 | 
				
			||||||
 | 
					        elif a.startswith(b):
 | 
				
			||||||
 | 
					            cost += 1
 | 
				
			||||||
 | 
					            if offset_b == 0:
 | 
				
			||||||
 | 
					                b2a_multi[j] = i
 | 
				
			||||||
 | 
					            j += 1
 | 
				
			||||||
 | 
					            offset_b = 0
 | 
				
			||||||
 | 
					            offset_a += len(b)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            assert "".join(tokens_a) != "".join(tokens_b)
 | 
				
			||||||
 | 
					            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
 | 
				
			||||||
 | 
					    return cost, a2b, b2a, a2b_multi, b2a_multi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _normalize_for_alignment(tokens):
 | 
				
			||||||
 | 
					    return [w.replace(" ", "").lower() for w in tokens]
 | 
				
			||||||
							
								
								
									
										111
									
								
								spacy/gold/augment.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										111
									
								
								spacy/gold/augment.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,111 @@
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					import itertools
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
 | 
				
			||||||
 | 
					    raw_text = example.text
 | 
				
			||||||
 | 
					    orig_dict = example.to_dict()
 | 
				
			||||||
 | 
					    variant_text, variant_token_annot = make_orth_variants(
 | 
				
			||||||
 | 
					        nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    doc = nlp.make_doc(variant_text)
 | 
				
			||||||
 | 
					    orig_dict["token_annotation"] = variant_token_annot
 | 
				
			||||||
 | 
					    return example.from_dict(doc, orig_dict)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
 | 
				
			||||||
 | 
					    if random.random() >= orth_variant_level:
 | 
				
			||||||
 | 
					        return raw_text, orig_token_dict
 | 
				
			||||||
 | 
					    if not orig_token_dict:
 | 
				
			||||||
 | 
					        return raw_text, orig_token_dict
 | 
				
			||||||
 | 
					    raw = raw_text
 | 
				
			||||||
 | 
					    token_dict = orig_token_dict
 | 
				
			||||||
 | 
					    lower = False
 | 
				
			||||||
 | 
					    if random.random() >= 0.5:
 | 
				
			||||||
 | 
					        lower = True
 | 
				
			||||||
 | 
					        if raw is not None:
 | 
				
			||||||
 | 
					            raw = raw.lower()
 | 
				
			||||||
 | 
					    ndsv = nlp.Defaults.single_orth_variants
 | 
				
			||||||
 | 
					    ndpv = nlp.Defaults.paired_orth_variants
 | 
				
			||||||
 | 
					    words = token_dict.get("words", [])
 | 
				
			||||||
 | 
					    tags = token_dict.get("tags", [])
 | 
				
			||||||
 | 
					    # keep unmodified if words or tags are not defined
 | 
				
			||||||
 | 
					    if words and tags:
 | 
				
			||||||
 | 
					        if lower:
 | 
				
			||||||
 | 
					            words = [w.lower() for w in words]
 | 
				
			||||||
 | 
					        # single variants
 | 
				
			||||||
 | 
					        punct_choices = [random.choice(x["variants"]) for x in ndsv]
 | 
				
			||||||
 | 
					        for word_idx in range(len(words)):
 | 
				
			||||||
 | 
					            for punct_idx in range(len(ndsv)):
 | 
				
			||||||
 | 
					                if (
 | 
				
			||||||
 | 
					                    tags[word_idx] in ndsv[punct_idx]["tags"]
 | 
				
			||||||
 | 
					                    and words[word_idx] in ndsv[punct_idx]["variants"]
 | 
				
			||||||
 | 
					                ):
 | 
				
			||||||
 | 
					                    words[word_idx] = punct_choices[punct_idx]
 | 
				
			||||||
 | 
					        # paired variants
 | 
				
			||||||
 | 
					        punct_choices = [random.choice(x["variants"]) for x in ndpv]
 | 
				
			||||||
 | 
					        for word_idx in range(len(words)):
 | 
				
			||||||
 | 
					            for punct_idx in range(len(ndpv)):
 | 
				
			||||||
 | 
					                if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
 | 
				
			||||||
 | 
					                    word_idx
 | 
				
			||||||
 | 
					                ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
 | 
				
			||||||
 | 
					                    # backup option: random left vs. right from pair
 | 
				
			||||||
 | 
					                    pair_idx = random.choice([0, 1])
 | 
				
			||||||
 | 
					                    # best option: rely on paired POS tags like `` / ''
 | 
				
			||||||
 | 
					                    if len(ndpv[punct_idx]["tags"]) == 2:
 | 
				
			||||||
 | 
					                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
 | 
				
			||||||
 | 
					                    # next best option: rely on position in variants
 | 
				
			||||||
 | 
					                    # (may not be unambiguous, so order of variants matters)
 | 
				
			||||||
 | 
					                    else:
 | 
				
			||||||
 | 
					                        for pair in ndpv[punct_idx]["variants"]:
 | 
				
			||||||
 | 
					                            if words[word_idx] in pair:
 | 
				
			||||||
 | 
					                                pair_idx = pair.index(words[word_idx])
 | 
				
			||||||
 | 
					                    words[word_idx] = punct_choices[punct_idx][pair_idx]
 | 
				
			||||||
 | 
					        token_dict["words"] = words
 | 
				
			||||||
 | 
					        token_dict["tags"] = tags
 | 
				
			||||||
 | 
					    # modify raw
 | 
				
			||||||
 | 
					    if raw is not None:
 | 
				
			||||||
 | 
					        variants = []
 | 
				
			||||||
 | 
					        for single_variants in ndsv:
 | 
				
			||||||
 | 
					            variants.extend(single_variants["variants"])
 | 
				
			||||||
 | 
					        for paired_variants in ndpv:
 | 
				
			||||||
 | 
					            variants.extend(
 | 
				
			||||||
 | 
					                list(itertools.chain.from_iterable(paired_variants["variants"]))
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        # store variants in reverse length order to be able to prioritize
 | 
				
			||||||
 | 
					        # longer matches (e.g., "---" before "--")
 | 
				
			||||||
 | 
					        variants = sorted(variants, key=lambda x: len(x))
 | 
				
			||||||
 | 
					        variants.reverse()
 | 
				
			||||||
 | 
					        variant_raw = ""
 | 
				
			||||||
 | 
					        raw_idx = 0
 | 
				
			||||||
 | 
					        # add initial whitespace
 | 
				
			||||||
 | 
					        while raw_idx < len(raw) and raw[raw_idx].isspace():
 | 
				
			||||||
 | 
					            variant_raw += raw[raw_idx]
 | 
				
			||||||
 | 
					            raw_idx += 1
 | 
				
			||||||
 | 
					        for word in words:
 | 
				
			||||||
 | 
					            match_found = False
 | 
				
			||||||
 | 
					            # skip whitespace words
 | 
				
			||||||
 | 
					            if word.isspace():
 | 
				
			||||||
 | 
					                match_found = True
 | 
				
			||||||
 | 
					            # add identical word
 | 
				
			||||||
 | 
					            elif word not in variants and raw[raw_idx:].startswith(word):
 | 
				
			||||||
 | 
					                variant_raw += word
 | 
				
			||||||
 | 
					                raw_idx += len(word)
 | 
				
			||||||
 | 
					                match_found = True
 | 
				
			||||||
 | 
					            # add variant word
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                for variant in variants:
 | 
				
			||||||
 | 
					                    if not match_found and raw[raw_idx:].startswith(variant):
 | 
				
			||||||
 | 
					                        raw_idx += len(variant)
 | 
				
			||||||
 | 
					                        variant_raw += word
 | 
				
			||||||
 | 
					                        match_found = True
 | 
				
			||||||
 | 
					            # something went wrong, abort
 | 
				
			||||||
 | 
					            # (add a warning message?)
 | 
				
			||||||
 | 
					            if not match_found:
 | 
				
			||||||
 | 
					                return raw_text, orig_token_dict
 | 
				
			||||||
 | 
					            # add following whitespace
 | 
				
			||||||
 | 
					            while raw_idx < len(raw) and raw[raw_idx].isspace():
 | 
				
			||||||
 | 
					                variant_raw += raw[raw_idx]
 | 
				
			||||||
 | 
					                raw_idx += 1
 | 
				
			||||||
 | 
					        raw = variant_raw
 | 
				
			||||||
 | 
					    return raw, token_dict
 | 
				
			||||||
							
								
								
									
										6
									
								
								spacy/gold/converters/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								spacy/gold/converters/__init__.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,6 @@
 | 
				
			||||||
 | 
					from .iob2docs import iob2docs  # noqa: F401
 | 
				
			||||||
 | 
					from .conll_ner2docs import conll_ner2docs  # noqa: F401
 | 
				
			||||||
 | 
					from .json2docs import json2docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# TODO: Update this one
 | 
				
			||||||
 | 
					# from .conllu2docs import conllu2docs  # noqa: F401
 | 
				
			||||||
| 
						 | 
					@ -1,17 +1,18 @@
 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import Printer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .. import tags_to_entities
 | 
				
			||||||
from ...gold import iob_to_biluo
 | 
					from ...gold import iob_to_biluo
 | 
				
			||||||
from ...lang.xx import MultiLanguage
 | 
					from ...lang.xx import MultiLanguage
 | 
				
			||||||
from ...tokens.doc import Doc
 | 
					from ...tokens import Doc, Span
 | 
				
			||||||
from ...util import load_model
 | 
					from ...util import load_model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def conll_ner2json(
 | 
					def conll_ner2docs(
 | 
				
			||||||
    input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 | 
					    input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Convert files in the CoNLL-2003 NER format and similar
 | 
					    Convert files in the CoNLL-2003 NER format and similar
 | 
				
			||||||
    whitespace-separated columns into JSON format for use with train cli.
 | 
					    whitespace-separated columns into Doc objects.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    The first column is the tokens, the final column is the IOB tags. If an
 | 
					    The first column is the tokens, the final column is the IOB tags. If an
 | 
				
			||||||
    additional second column is present, the second column is the tags.
 | 
					    additional second column is present, the second column is the tags.
 | 
				
			||||||
| 
						 | 
					@ -81,17 +82,25 @@ def conll_ner2json(
 | 
				
			||||||
            "No document delimiters found. Use `-n` to automatically group "
 | 
					            "No document delimiters found. Use `-n` to automatically group "
 | 
				
			||||||
            "sentences into documents."
 | 
					            "sentences into documents."
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if model:
 | 
				
			||||||
 | 
					        nlp = load_model(model)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        nlp = MultiLanguage()
 | 
				
			||||||
    output_docs = []
 | 
					    output_docs = []
 | 
				
			||||||
    for doc in input_data.strip().split(doc_delimiter):
 | 
					    for conll_doc in input_data.strip().split(doc_delimiter):
 | 
				
			||||||
        doc = doc.strip()
 | 
					        conll_doc = conll_doc.strip()
 | 
				
			||||||
        if not doc:
 | 
					        if not conll_doc:
 | 
				
			||||||
            continue
 | 
					            continue
 | 
				
			||||||
        output_doc = []
 | 
					        words = []
 | 
				
			||||||
        for sent in doc.split("\n\n"):
 | 
					        sent_starts = []
 | 
				
			||||||
            sent = sent.strip()
 | 
					        pos_tags = []
 | 
				
			||||||
            if not sent:
 | 
					        biluo_tags = []
 | 
				
			||||||
 | 
					        for conll_sent in conll_doc.split("\n\n"):
 | 
				
			||||||
 | 
					            conll_sent = conll_sent.strip()
 | 
				
			||||||
 | 
					            if not conll_sent:
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            lines = [line.strip() for line in sent.split("\n") if line.strip()]
 | 
					            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
 | 
				
			||||||
            cols = list(zip(*[line.split() for line in lines]))
 | 
					            cols = list(zip(*[line.split() for line in lines]))
 | 
				
			||||||
            if len(cols) < 2:
 | 
					            if len(cols) < 2:
 | 
				
			||||||
                raise ValueError(
 | 
					                raise ValueError(
 | 
				
			||||||
| 
						 | 
					@ -99,25 +108,19 @@ def conll_ner2json(
 | 
				
			||||||
                    "Try checking whitespace and delimiters. See "
 | 
					                    "Try checking whitespace and delimiters. See "
 | 
				
			||||||
                    "https://spacy.io/api/cli#convert"
 | 
					                    "https://spacy.io/api/cli#convert"
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            words = cols[0]
 | 
					            length = len(cols[0])
 | 
				
			||||||
            iob_ents = cols[-1]
 | 
					            words.extend(cols[0])
 | 
				
			||||||
            if len(cols) > 2:
 | 
					            sent_starts.extend([True] + [False] * (length - 1))
 | 
				
			||||||
                tags = cols[1]
 | 
					            biluo_tags.extend(iob_to_biluo(cols[-1]))
 | 
				
			||||||
            else:
 | 
					            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
 | 
				
			||||||
                tags = ["-"] * len(words)
 | 
					
 | 
				
			||||||
            biluo_ents = iob_to_biluo(iob_ents)
 | 
					        doc = Doc(nlp.vocab, words=words)
 | 
				
			||||||
            output_doc.append(
 | 
					        for i, token in enumerate(doc):
 | 
				
			||||||
                {
 | 
					            token.tag_ = pos_tags[i]
 | 
				
			||||||
                    "tokens": [
 | 
					            token.is_sent_start = sent_starts[i]
 | 
				
			||||||
                        {"orth": w, "tag": tag, "ner": ent}
 | 
					        entities = tags_to_entities(biluo_tags)
 | 
				
			||||||
                        for (w, tag, ent) in zip(words, tags, biluo_ents)
 | 
					        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
 | 
				
			||||||
                    ]
 | 
					        output_docs.append(doc)
 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        output_docs.append(
 | 
					 | 
				
			||||||
            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        output_doc = []
 | 
					 | 
				
			||||||
    return output_docs
 | 
					    return output_docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,10 +1,10 @@
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .conll_ner2docs import n_sents_info
 | 
				
			||||||
from ...gold import Example
 | 
					from ...gold import Example
 | 
				
			||||||
from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
 | 
					from ...gold import iob_to_biluo, spans_from_biluo_tags
 | 
				
			||||||
from ...language import Language
 | 
					from ...language import Language
 | 
				
			||||||
from ...tokens import Doc, Token
 | 
					from ...tokens import Doc, Token
 | 
				
			||||||
from .conll_ner2json import n_sents_info
 | 
					 | 
				
			||||||
from wasabi import Printer
 | 
					from wasabi import Printer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -12,7 +12,6 @@ def conllu2json(
 | 
				
			||||||
    input_data,
 | 
					    input_data,
 | 
				
			||||||
    n_sents=10,
 | 
					    n_sents=10,
 | 
				
			||||||
    append_morphology=False,
 | 
					    append_morphology=False,
 | 
				
			||||||
    lang=None,
 | 
					 | 
				
			||||||
    ner_map=None,
 | 
					    ner_map=None,
 | 
				
			||||||
    merge_subtokens=False,
 | 
					    merge_subtokens=False,
 | 
				
			||||||
    no_print=False,
 | 
					    no_print=False,
 | 
				
			||||||
| 
						 | 
					@ -44,10 +43,7 @@ def conllu2json(
 | 
				
			||||||
        raw += example.text
 | 
					        raw += example.text
 | 
				
			||||||
        sentences.append(
 | 
					        sentences.append(
 | 
				
			||||||
            generate_sentence(
 | 
					            generate_sentence(
 | 
				
			||||||
                example.token_annotation,
 | 
					                example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
 | 
				
			||||||
                has_ner_tags,
 | 
					 | 
				
			||||||
                MISC_NER_PATTERN,
 | 
					 | 
				
			||||||
                ner_map=ner_map,
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        # Real-sized documents could be extracted using the comments on the
 | 
					        # Real-sized documents could be extracted using the comments on the
 | 
				
			||||||
| 
						 | 
					@ -145,21 +141,22 @@ def get_entities(lines, tag_pattern, ner_map=None):
 | 
				
			||||||
    return iob_to_biluo(iob)
 | 
					    return iob_to_biluo(iob)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
 | 
					def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
 | 
				
			||||||
    sentence = {}
 | 
					    sentence = {}
 | 
				
			||||||
    tokens = []
 | 
					    tokens = []
 | 
				
			||||||
    for i, id_ in enumerate(token_annotation.ids):
 | 
					    token_annotation = example_dict["token_annotation"]
 | 
				
			||||||
 | 
					    for i, id_ in enumerate(token_annotation["ids"]):
 | 
				
			||||||
        token = {}
 | 
					        token = {}
 | 
				
			||||||
        token["id"] = id_
 | 
					        token["id"] = id_
 | 
				
			||||||
        token["orth"] = token_annotation.get_word(i)
 | 
					        token["orth"] = token_annotation["words"][i]
 | 
				
			||||||
        token["tag"] = token_annotation.get_tag(i)
 | 
					        token["tag"] = token_annotation["tags"][i]
 | 
				
			||||||
        token["pos"] = token_annotation.get_pos(i)
 | 
					        token["pos"] = token_annotation["pos"][i]
 | 
				
			||||||
        token["lemma"] = token_annotation.get_lemma(i)
 | 
					        token["lemma"] = token_annotation["lemmas"][i]
 | 
				
			||||||
        token["morph"] = token_annotation.get_morph(i)
 | 
					        token["morph"] = token_annotation["morphs"][i]
 | 
				
			||||||
        token["head"] = token_annotation.get_head(i) - id_
 | 
					        token["head"] = token_annotation["heads"][i] - i
 | 
				
			||||||
        token["dep"] = token_annotation.get_dep(i)
 | 
					        token["dep"] = token_annotation["deps"][i]
 | 
				
			||||||
        if has_ner_tags:
 | 
					        if has_ner_tags:
 | 
				
			||||||
            token["ner"] = token_annotation.get_entity(i)
 | 
					            token["ner"] = example_dict["doc_annotation"]["entities"][i]
 | 
				
			||||||
        tokens.append(token)
 | 
					        tokens.append(token)
 | 
				
			||||||
    sentence["tokens"] = tokens
 | 
					    sentence["tokens"] = tokens
 | 
				
			||||||
    return sentence
 | 
					    return sentence
 | 
				
			||||||
| 
						 | 
					@ -267,40 +264,25 @@ def example_from_conllu_sentence(
 | 
				
			||||||
        doc = merge_conllu_subtokens(lines, doc)
 | 
					        doc = merge_conllu_subtokens(lines, doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # create Example from custom Doc annotation
 | 
					    # create Example from custom Doc annotation
 | 
				
			||||||
    ids, words, tags, heads, deps = [], [], [], [], []
 | 
					    words, spaces, tags, morphs, lemmas = [], [], [], [], []
 | 
				
			||||||
    pos, lemmas, morphs, spaces = [], [], [], []
 | 
					 | 
				
			||||||
    for i, t in enumerate(doc):
 | 
					    for i, t in enumerate(doc):
 | 
				
			||||||
        ids.append(i)
 | 
					 | 
				
			||||||
        words.append(t._.merged_orth)
 | 
					        words.append(t._.merged_orth)
 | 
				
			||||||
 | 
					        lemmas.append(t._.merged_lemma)
 | 
				
			||||||
 | 
					        spaces.append(t._.merged_spaceafter)
 | 
				
			||||||
 | 
					        morphs.append(t._.merged_morph)
 | 
				
			||||||
        if append_morphology and t._.merged_morph:
 | 
					        if append_morphology and t._.merged_morph:
 | 
				
			||||||
            tags.append(t.tag_ + "__" + t._.merged_morph)
 | 
					            tags.append(t.tag_ + "__" + t._.merged_morph)
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            tags.append(t.tag_)
 | 
					            tags.append(t.tag_)
 | 
				
			||||||
        pos.append(t.pos_)
 | 
					
 | 
				
			||||||
        morphs.append(t._.merged_morph)
 | 
					    doc_x = Doc(vocab, words=words, spaces=spaces)
 | 
				
			||||||
        lemmas.append(t._.merged_lemma)
 | 
					    ref_dict = Example(doc_x, reference=doc).to_dict()
 | 
				
			||||||
        heads.append(t.head.i)
 | 
					    ref_dict["words"] = words
 | 
				
			||||||
        deps.append(t.dep_)
 | 
					    ref_dict["lemmas"] = lemmas
 | 
				
			||||||
        spaces.append(t._.merged_spaceafter)
 | 
					    ref_dict["spaces"] = spaces
 | 
				
			||||||
    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
 | 
					    ref_dict["tags"] = tags
 | 
				
			||||||
    ents = biluo_tags_from_offsets(doc, ent_offsets)
 | 
					    ref_dict["morphs"] = morphs
 | 
				
			||||||
    raw = ""
 | 
					    example = Example.from_dict(doc_x, ref_dict)
 | 
				
			||||||
    for word, space in zip(words, spaces):
 | 
					 | 
				
			||||||
        raw += word
 | 
					 | 
				
			||||||
        if space:
 | 
					 | 
				
			||||||
            raw += " "
 | 
					 | 
				
			||||||
    example = Example(doc=raw)
 | 
					 | 
				
			||||||
    example.set_token_annotation(
 | 
					 | 
				
			||||||
        ids=ids,
 | 
					 | 
				
			||||||
        words=words,
 | 
					 | 
				
			||||||
        tags=tags,
 | 
					 | 
				
			||||||
        pos=pos,
 | 
					 | 
				
			||||||
        morphs=morphs,
 | 
					 | 
				
			||||||
        lemmas=lemmas,
 | 
					 | 
				
			||||||
        heads=heads,
 | 
					 | 
				
			||||||
        deps=deps,
 | 
					 | 
				
			||||||
        entities=ents,
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    return example
 | 
					    return example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										64
									
								
								spacy/gold/converters/iob2docs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										64
									
								
								spacy/gold/converters/iob2docs.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,64 @@
 | 
				
			||||||
 | 
					from wasabi import Printer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .conll_ner2docs import n_sents_info
 | 
				
			||||||
 | 
					from ...gold import iob_to_biluo, tags_to_entities
 | 
				
			||||||
 | 
					from ...tokens import Doc, Span
 | 
				
			||||||
 | 
					from ...util import minibatch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    Convert IOB files with one sentence per line and tags separated with '|'
 | 
				
			||||||
 | 
					    into Doc objects so they can be saved. IOB and IOB2 are accepted.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Sample formats:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
 | 
				
			||||||
 | 
					    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
 | 
				
			||||||
 | 
					    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
 | 
				
			||||||
 | 
					    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    msg = Printer(no_print=no_print)
 | 
				
			||||||
 | 
					    if n_sents > 0:
 | 
				
			||||||
 | 
					        n_sents_info(msg, n_sents)
 | 
				
			||||||
 | 
					    docs = read_iob(input_data.split("\n"), vocab, n_sents)
 | 
				
			||||||
 | 
					    return docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def read_iob(raw_sents, vocab, n_sents):
 | 
				
			||||||
 | 
					    docs = []
 | 
				
			||||||
 | 
					    for group in minibatch(raw_sents, size=n_sents):
 | 
				
			||||||
 | 
					        tokens = []
 | 
				
			||||||
 | 
					        words = []
 | 
				
			||||||
 | 
					        tags = []
 | 
				
			||||||
 | 
					        iob = []
 | 
				
			||||||
 | 
					        sent_starts = []
 | 
				
			||||||
 | 
					        for line in group:
 | 
				
			||||||
 | 
					            if not line.strip():
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            sent_tokens = [t.split("|") for t in line.split()]
 | 
				
			||||||
 | 
					            if len(sent_tokens[0]) == 3:
 | 
				
			||||||
 | 
					                sent_words, sent_tags, sent_iob = zip(*sent_tokens)
 | 
				
			||||||
 | 
					            elif len(sent_tokens[0]) == 2:
 | 
				
			||||||
 | 
					                sent_words, sent_iob = zip(*sent_tokens)
 | 
				
			||||||
 | 
					                sent_tags = ["-"] * len(sent_words)
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                raise ValueError(
 | 
				
			||||||
 | 
					                    "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					            words.extend(sent_words)
 | 
				
			||||||
 | 
					            tags.extend(sent_tags)
 | 
				
			||||||
 | 
					            iob.extend(sent_iob)
 | 
				
			||||||
 | 
					            tokens.extend(sent_tokens)
 | 
				
			||||||
 | 
					            sent_starts.append(True)
 | 
				
			||||||
 | 
					            sent_starts.extend([False for _ in sent_words[1:]])
 | 
				
			||||||
 | 
					        doc = Doc(vocab, words=words)
 | 
				
			||||||
 | 
					        for i, tag in enumerate(tags):
 | 
				
			||||||
 | 
					            doc[i].tag_ = tag
 | 
				
			||||||
 | 
					        for i, sent_start in enumerate(sent_starts):
 | 
				
			||||||
 | 
					            doc[i].is_sent_start = sent_start
 | 
				
			||||||
 | 
					        biluo = iob_to_biluo(iob)
 | 
				
			||||||
 | 
					        entities = tags_to_entities(biluo)
 | 
				
			||||||
 | 
					        doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities]
 | 
				
			||||||
 | 
					        docs.append(doc)
 | 
				
			||||||
 | 
					    return docs
 | 
				
			||||||
							
								
								
									
										24
									
								
								spacy/gold/converters/json2docs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								spacy/gold/converters/json2docs.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,24 @@
 | 
				
			||||||
 | 
					import srsly
 | 
				
			||||||
 | 
					from ..gold_io import json_iterate, json_to_annotations
 | 
				
			||||||
 | 
					from ..example import annotations2doc
 | 
				
			||||||
 | 
					from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 | 
				
			||||||
 | 
					from ...util import load_model
 | 
				
			||||||
 | 
					from ...lang.xx import MultiLanguage
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def json2docs(input_data, model=None, **kwargs):
 | 
				
			||||||
 | 
					    nlp = load_model(model) if model is not None else MultiLanguage()
 | 
				
			||||||
 | 
					    if not isinstance(input_data, bytes):
 | 
				
			||||||
 | 
					        if not isinstance(input_data, str):
 | 
				
			||||||
 | 
					            input_data = srsly.json_dumps(input_data)
 | 
				
			||||||
 | 
					        input_data = input_data.encode("utf8")
 | 
				
			||||||
 | 
					    docs = []
 | 
				
			||||||
 | 
					    for json_doc in json_iterate(input_data):
 | 
				
			||||||
 | 
					        for json_para in json_to_annotations(json_doc):
 | 
				
			||||||
 | 
					            example_dict = _fix_legacy_dict_data(json_para)
 | 
				
			||||||
 | 
					            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
 | 
				
			||||||
 | 
					            if json_para.get("raw"):
 | 
				
			||||||
 | 
					                assert tok_dict.get("SPACY")
 | 
				
			||||||
 | 
					            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
 | 
				
			||||||
 | 
					            docs.append(doc)
 | 
				
			||||||
 | 
					    return docs
 | 
				
			||||||
							
								
								
									
										122
									
								
								spacy/gold/corpus.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								spacy/gold/corpus.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,122 @@
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					from .. import util
 | 
				
			||||||
 | 
					from .example import Example
 | 
				
			||||||
 | 
					from ..tokens import DocBin, Doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Corpus:
 | 
				
			||||||
 | 
					    """An annotated corpus, reading train and dev datasets from
 | 
				
			||||||
 | 
					    the DocBin (.spacy) format.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DOCS: https://spacy.io/api/goldcorpus
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, train_loc, dev_loc, limit=0):
 | 
				
			||||||
 | 
					        """Create a Corpus.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        train (str / Path): File or directory of training data.
 | 
				
			||||||
 | 
					        dev (str / Path): File or directory of development data.
 | 
				
			||||||
 | 
					        limit (int): Max. number of examples returned
 | 
				
			||||||
 | 
					        RETURNS (Corpus): The newly created object.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        self.train_loc = train_loc
 | 
				
			||||||
 | 
					        self.dev_loc = dev_loc
 | 
				
			||||||
 | 
					        self.limit = limit
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @staticmethod
 | 
				
			||||||
 | 
					    def walk_corpus(path):
 | 
				
			||||||
 | 
					        path = util.ensure_path(path)
 | 
				
			||||||
 | 
					        if not path.is_dir():
 | 
				
			||||||
 | 
					            return [path]
 | 
				
			||||||
 | 
					        paths = [path]
 | 
				
			||||||
 | 
					        locs = []
 | 
				
			||||||
 | 
					        seen = set()
 | 
				
			||||||
 | 
					        for path in paths:
 | 
				
			||||||
 | 
					            if str(path) in seen:
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            seen.add(str(path))
 | 
				
			||||||
 | 
					            if path.parts[-1].startswith("."):
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            elif path.is_dir():
 | 
				
			||||||
 | 
					                paths.extend(path.iterdir())
 | 
				
			||||||
 | 
					            elif path.parts[-1].endswith(".spacy"):
 | 
				
			||||||
 | 
					                locs.append(path)
 | 
				
			||||||
 | 
					        return locs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def make_examples(self, nlp, reference_docs, max_length=0):
 | 
				
			||||||
 | 
					        for reference in reference_docs:
 | 
				
			||||||
 | 
					            if max_length >= 1 and len(reference) >= max_length:
 | 
				
			||||||
 | 
					                if reference.is_sentenced:
 | 
				
			||||||
 | 
					                    for ref_sent in reference.sents:
 | 
				
			||||||
 | 
					                        yield Example(
 | 
				
			||||||
 | 
					                            nlp.make_doc(ref_sent.text),
 | 
				
			||||||
 | 
					                            ref_sent.as_doc()
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                yield Example(
 | 
				
			||||||
 | 
					                    nlp.make_doc(reference.text),
 | 
				
			||||||
 | 
					                    reference
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    def make_examples_gold_preproc(self, nlp, reference_docs):
 | 
				
			||||||
 | 
					        for reference in reference_docs:
 | 
				
			||||||
 | 
					            if reference.is_sentenced:
 | 
				
			||||||
 | 
					                ref_sents = [sent.as_doc() for sent in reference.sents]
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                ref_sents = [reference]
 | 
				
			||||||
 | 
					            for ref_sent in ref_sents:
 | 
				
			||||||
 | 
					                yield Example(
 | 
				
			||||||
 | 
					                    Doc(
 | 
				
			||||||
 | 
					                        nlp.vocab, 
 | 
				
			||||||
 | 
					                        words=[w.text for w in ref_sent],
 | 
				
			||||||
 | 
					                        spaces=[bool(w.whitespace_) for w in ref_sent]
 | 
				
			||||||
 | 
					                    ),
 | 
				
			||||||
 | 
					                    ref_sent
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def read_docbin(self, vocab, locs):
 | 
				
			||||||
 | 
					        """ Yield training examples as example dicts """
 | 
				
			||||||
 | 
					        i = 0
 | 
				
			||||||
 | 
					        for loc in locs:
 | 
				
			||||||
 | 
					            loc = util.ensure_path(loc)
 | 
				
			||||||
 | 
					            if loc.parts[-1].endswith(".spacy"):
 | 
				
			||||||
 | 
					                with loc.open("rb") as file_:
 | 
				
			||||||
 | 
					                    doc_bin = DocBin().from_bytes(file_.read())
 | 
				
			||||||
 | 
					                docs = doc_bin.get_docs(vocab)
 | 
				
			||||||
 | 
					                for doc in docs:
 | 
				
			||||||
 | 
					                    if len(doc):
 | 
				
			||||||
 | 
					                        yield doc
 | 
				
			||||||
 | 
					                        i += 1
 | 
				
			||||||
 | 
					                        if self.limit >= 1 and i >= self.limit:
 | 
				
			||||||
 | 
					                            break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def count_train(self, nlp):
 | 
				
			||||||
 | 
					        """Returns count of words in train examples"""
 | 
				
			||||||
 | 
					        n = 0
 | 
				
			||||||
 | 
					        i = 0
 | 
				
			||||||
 | 
					        for example in self.train_dataset(nlp):
 | 
				
			||||||
 | 
					            n += len(example.predicted)
 | 
				
			||||||
 | 
					            if self.limit >= 0 and i >= self.limit:
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					            i += 1
 | 
				
			||||||
 | 
					        return n
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def train_dataset(self, nlp, *, shuffle=True, gold_preproc=False,
 | 
				
			||||||
 | 
					            max_length=0, **kwargs):
 | 
				
			||||||
 | 
					        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
 | 
				
			||||||
 | 
					        if gold_preproc:
 | 
				
			||||||
 | 
					            examples = self.make_examples_gold_preproc(nlp, ref_docs)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            examples = self.make_examples(nlp, ref_docs, max_length)
 | 
				
			||||||
 | 
					        if shuffle:
 | 
				
			||||||
 | 
					            examples = list(examples)
 | 
				
			||||||
 | 
					            random.shuffle(examples)
 | 
				
			||||||
 | 
					        yield from examples
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
 | 
				
			||||||
 | 
					        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
 | 
				
			||||||
 | 
					        if gold_preproc:
 | 
				
			||||||
 | 
					            examples = self.make_examples_gold_preproc(nlp, ref_docs)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            examples = self.make_examples(nlp, ref_docs, max_length=0)
 | 
				
			||||||
 | 
					        yield from examples
 | 
				
			||||||
							
								
								
									
										8
									
								
								spacy/gold/example.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								spacy/gold/example.pxd
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,8 @@
 | 
				
			||||||
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
 | 
					from .align cimport Alignment
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class Example:
 | 
				
			||||||
 | 
					    cdef readonly Doc x
 | 
				
			||||||
 | 
					    cdef readonly Doc y
 | 
				
			||||||
 | 
					    cdef readonly Alignment _alignment
 | 
				
			||||||
							
								
								
									
										434
									
								
								spacy/gold/example.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										434
									
								
								spacy/gold/example.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,434 @@
 | 
				
			||||||
 | 
					import warnings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import numpy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ..tokens import Token
 | 
				
			||||||
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
 | 
					from ..tokens.span cimport Span
 | 
				
			||||||
 | 
					from ..tokens.span import Span
 | 
				
			||||||
 | 
					from ..attrs import IDS
 | 
				
			||||||
 | 
					from .align cimport Alignment
 | 
				
			||||||
 | 
					from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 | 
				
			||||||
 | 
					from .iob_utils import spans_from_biluo_tags
 | 
				
			||||||
 | 
					from .align import Alignment
 | 
				
			||||||
 | 
					from ..errors import Errors, AlignmentError
 | 
				
			||||||
 | 
					from ..syntax import nonproj
 | 
				
			||||||
 | 
					from ..util import get_words_and_spaces
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
 | 
				
			||||||
 | 
					    """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
 | 
				
			||||||
 | 
					    attrs, array = _annot2array(vocab, tok_annot, doc_annot)
 | 
				
			||||||
 | 
					    output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
 | 
				
			||||||
 | 
					    if "entities" in doc_annot:
 | 
				
			||||||
 | 
					       _add_entities_to_doc(output, doc_annot["entities"])
 | 
				
			||||||
 | 
					    if array.size:
 | 
				
			||||||
 | 
					        output = output.from_array(attrs, array)
 | 
				
			||||||
 | 
					    # links are currently added with ENT_KB_ID on the token level
 | 
				
			||||||
 | 
					    output.cats.update(doc_annot.get("cats", {}))
 | 
				
			||||||
 | 
					    return output
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class Example:
 | 
				
			||||||
 | 
					    def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
 | 
				
			||||||
 | 
					        """ Doc can either be text, or an actual Doc """
 | 
				
			||||||
 | 
					        msg = "Example.__init__ got None for '{arg}'. Requires Doc."
 | 
				
			||||||
 | 
					        if predicted is None:
 | 
				
			||||||
 | 
					            raise TypeError(msg.format(arg="predicted"))
 | 
				
			||||||
 | 
					        if reference is None:
 | 
				
			||||||
 | 
					            raise TypeError(msg.format(arg="reference"))
 | 
				
			||||||
 | 
					        self.x = predicted
 | 
				
			||||||
 | 
					        self.y = reference
 | 
				
			||||||
 | 
					        self._alignment = alignment
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    property predicted:
 | 
				
			||||||
 | 
					        def __get__(self):
 | 
				
			||||||
 | 
					            return self.x
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def __set__(self, doc):
 | 
				
			||||||
 | 
					            self.x = doc
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    property reference:
 | 
				
			||||||
 | 
					        def __get__(self):
 | 
				
			||||||
 | 
					            return self.y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        def __set__(self, doc):
 | 
				
			||||||
 | 
					            self.y = doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def copy(self):
 | 
				
			||||||
 | 
					        return Example(
 | 
				
			||||||
 | 
					            self.x.copy(),
 | 
				
			||||||
 | 
					            self.y.copy()
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def from_dict(cls, Doc predicted, dict example_dict):
 | 
				
			||||||
 | 
					        if example_dict is None:
 | 
				
			||||||
 | 
					            raise ValueError("Example.from_dict expected dict, received None")
 | 
				
			||||||
 | 
					        if not isinstance(predicted, Doc):
 | 
				
			||||||
 | 
					            raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}")
 | 
				
			||||||
 | 
					        example_dict = _fix_legacy_dict_data(example_dict)
 | 
				
			||||||
 | 
					        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
 | 
				
			||||||
 | 
					        if "ORTH" not in tok_dict:
 | 
				
			||||||
 | 
					            tok_dict["ORTH"] = [tok.text for tok in predicted]
 | 
				
			||||||
 | 
					            tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
 | 
				
			||||||
 | 
					        if not _has_field(tok_dict, "SPACY"):
 | 
				
			||||||
 | 
					            spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
 | 
				
			||||||
 | 
					        return Example(
 | 
				
			||||||
 | 
					            predicted,
 | 
				
			||||||
 | 
					            annotations2doc(predicted.vocab, tok_dict, doc_dict)
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    @property
 | 
				
			||||||
 | 
					    def alignment(self):
 | 
				
			||||||
 | 
					        if self._alignment is None:
 | 
				
			||||||
 | 
					            spacy_words = [token.orth_ for token in self.predicted]
 | 
				
			||||||
 | 
					            gold_words = [token.orth_ for token in self.reference]
 | 
				
			||||||
 | 
					            if gold_words == []:
 | 
				
			||||||
 | 
					                gold_words = spacy_words
 | 
				
			||||||
 | 
					            self._alignment = Alignment(spacy_words, gold_words)
 | 
				
			||||||
 | 
					        return self._alignment
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_aligned(self, field, as_string=False):
 | 
				
			||||||
 | 
					        """Return an aligned array for a token attribute."""
 | 
				
			||||||
 | 
					        i2j_multi = self.alignment.i2j_multi
 | 
				
			||||||
 | 
					        cand_to_gold = self.alignment.cand_to_gold
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        vocab = self.reference.vocab
 | 
				
			||||||
 | 
					        gold_values = self.reference.to_array([field])
 | 
				
			||||||
 | 
					        output = [None] * len(self.predicted)
 | 
				
			||||||
 | 
					        for i, gold_i in enumerate(cand_to_gold):
 | 
				
			||||||
 | 
					            if self.predicted[i].text.isspace():
 | 
				
			||||||
 | 
					                output[i] = None
 | 
				
			||||||
 | 
					            if gold_i is None:
 | 
				
			||||||
 | 
					                if i in i2j_multi:
 | 
				
			||||||
 | 
					                    output[i] = gold_values[i2j_multi[i]]
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    output[i] = None
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                output[i] = gold_values[gold_i]
 | 
				
			||||||
 | 
					        if as_string and field not in ["ENT_IOB", "SENT_START"]:
 | 
				
			||||||
 | 
					            output = [vocab.strings[o] if o is not None else o for o in output]
 | 
				
			||||||
 | 
					        return output
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_aligned_parse(self, projectivize=True):
 | 
				
			||||||
 | 
					        cand_to_gold = self.alignment.cand_to_gold
 | 
				
			||||||
 | 
					        gold_to_cand = self.alignment.gold_to_cand
 | 
				
			||||||
 | 
					        aligned_heads = [None] * self.x.length
 | 
				
			||||||
 | 
					        aligned_deps = [None] * self.x.length
 | 
				
			||||||
 | 
					        heads = [token.head.i for token in self.y]
 | 
				
			||||||
 | 
					        deps = [token.dep_ for token in self.y]
 | 
				
			||||||
 | 
					        heads, deps = nonproj.projectivize(heads, deps)
 | 
				
			||||||
 | 
					        for cand_i in range(self.x.length):
 | 
				
			||||||
 | 
					            gold_i = cand_to_gold[cand_i]
 | 
				
			||||||
 | 
					            if gold_i is not None: # Alignment found
 | 
				
			||||||
 | 
					                gold_head = gold_to_cand[heads[gold_i]]
 | 
				
			||||||
 | 
					                if gold_head is not None:
 | 
				
			||||||
 | 
					                    aligned_heads[cand_i] = gold_head
 | 
				
			||||||
 | 
					                    aligned_deps[cand_i] = deps[gold_i]
 | 
				
			||||||
 | 
					        return aligned_heads, aligned_deps
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_aligned_ner(self):
 | 
				
			||||||
 | 
					        if not self.y.is_nered:
 | 
				
			||||||
 | 
					            return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
 | 
				
			||||||
 | 
					        x_text = self.x.text
 | 
				
			||||||
 | 
					        # Get a list of entities, and make spans for non-entity tokens.
 | 
				
			||||||
 | 
					        # We then work through the spans in order, trying to find them in
 | 
				
			||||||
 | 
					        # the text and using that to get the offset. Any token that doesn't
 | 
				
			||||||
 | 
					        # get a tag set this way is tagged None.
 | 
				
			||||||
 | 
					        # This could maybe be improved? It at least feels easy to reason about.
 | 
				
			||||||
 | 
					        y_spans = list(self.y.ents)
 | 
				
			||||||
 | 
					        y_spans.sort()
 | 
				
			||||||
 | 
					        x_text_offset = 0
 | 
				
			||||||
 | 
					        x_spans = []
 | 
				
			||||||
 | 
					        for y_span in y_spans:
 | 
				
			||||||
 | 
					            if x_text.count(y_span.text) >= 1:
 | 
				
			||||||
 | 
					                start_char = x_text.index(y_span.text) + x_text_offset
 | 
				
			||||||
 | 
					                end_char = start_char + len(y_span.text)
 | 
				
			||||||
 | 
					                x_span = self.x.char_span(start_char, end_char, label=y_span.label)
 | 
				
			||||||
 | 
					                if x_span is not None:
 | 
				
			||||||
 | 
					                    x_spans.append(x_span)
 | 
				
			||||||
 | 
					                    x_text = self.x.text[end_char:]
 | 
				
			||||||
 | 
					                    x_text_offset = end_char
 | 
				
			||||||
 | 
					        x_tags = biluo_tags_from_offsets(
 | 
				
			||||||
 | 
					            self.x, 
 | 
				
			||||||
 | 
					            [(e.start_char, e.end_char, e.label_) for e in x_spans],
 | 
				
			||||||
 | 
					            missing=None
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        gold_to_cand = self.alignment.gold_to_cand
 | 
				
			||||||
 | 
					        for token in self.y:
 | 
				
			||||||
 | 
					            if token.ent_iob_ == "O":
 | 
				
			||||||
 | 
					                cand_i = gold_to_cand[token.i]
 | 
				
			||||||
 | 
					                if cand_i is not None and x_tags[cand_i] is None:
 | 
				
			||||||
 | 
					                    x_tags[cand_i] = "O"
 | 
				
			||||||
 | 
					        i2j_multi = self.alignment.i2j_multi
 | 
				
			||||||
 | 
					        for i, tag in enumerate(x_tags):
 | 
				
			||||||
 | 
					            if tag is None and i in i2j_multi:
 | 
				
			||||||
 | 
					                gold_i = i2j_multi[i]
 | 
				
			||||||
 | 
					                if gold_i is not None and self.y[gold_i].ent_iob_ == "O":
 | 
				
			||||||
 | 
					                    x_tags[i] = "O"
 | 
				
			||||||
 | 
					        return x_tags
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def to_dict(self):
 | 
				
			||||||
 | 
					        return {
 | 
				
			||||||
 | 
					            "doc_annotation": {
 | 
				
			||||||
 | 
					                "cats": dict(self.reference.cats),
 | 
				
			||||||
 | 
					                "entities": biluo_tags_from_doc(self.reference),
 | 
				
			||||||
 | 
					                "links": self._links_to_dict()
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            "token_annotation": {
 | 
				
			||||||
 | 
					                "ids": [t.i+1 for t in self.reference],
 | 
				
			||||||
 | 
					                "words": [t.text for t in self.reference],
 | 
				
			||||||
 | 
					                "tags": [t.tag_ for t in self.reference],
 | 
				
			||||||
 | 
					                "lemmas": [t.lemma_ for t in self.reference],
 | 
				
			||||||
 | 
					                "pos": [t.pos_ for t in self.reference],
 | 
				
			||||||
 | 
					                "morphs": [t.morph_ for t in self.reference],
 | 
				
			||||||
 | 
					                "heads": [t.head.i for t in self.reference],
 | 
				
			||||||
 | 
					                "deps": [t.dep_ for t in self.reference],
 | 
				
			||||||
 | 
					                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference]
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _links_to_dict(self):
 | 
				
			||||||
 | 
					        links = {}
 | 
				
			||||||
 | 
					        for ent in self.reference.ents:
 | 
				
			||||||
 | 
					            if ent.kb_id_:
 | 
				
			||||||
 | 
					                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
 | 
				
			||||||
 | 
					        return links
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def split_sents(self):
 | 
				
			||||||
 | 
					        """ Split the token annotations into multiple Examples based on
 | 
				
			||||||
 | 
					        sent_starts and return a list of the new Examples"""
 | 
				
			||||||
 | 
					        if not self.reference.is_sentenced:
 | 
				
			||||||
 | 
					            return [self]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        sent_starts = self.get_aligned("SENT_START")
 | 
				
			||||||
 | 
					        sent_starts.append(1)   # appending virtual start of a next sentence to facilitate search
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        output = []
 | 
				
			||||||
 | 
					        pred_start = 0
 | 
				
			||||||
 | 
					        for sent in self.reference.sents:
 | 
				
			||||||
 | 
					            new_ref = sent.as_doc()
 | 
				
			||||||
 | 
					            pred_end = sent_starts.index(1, pred_start+1)  # find where the next sentence starts
 | 
				
			||||||
 | 
					            new_pred = self.predicted[pred_start : pred_end].as_doc()
 | 
				
			||||||
 | 
					            output.append(Example(new_pred, new_ref))
 | 
				
			||||||
 | 
					            pred_start = pred_end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return output
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    property text:
 | 
				
			||||||
 | 
					        def __get__(self):
 | 
				
			||||||
 | 
					            return self.x.text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __str__(self):
 | 
				
			||||||
 | 
					        return str(self.to_dict())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __repr__(self):
 | 
				
			||||||
 | 
					        return str(self.to_dict())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _annot2array(vocab, tok_annot, doc_annot):
 | 
				
			||||||
 | 
					    attrs = []
 | 
				
			||||||
 | 
					    values = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for key, value in doc_annot.items():
 | 
				
			||||||
 | 
					        if value:
 | 
				
			||||||
 | 
					            if key == "entities":
 | 
				
			||||||
 | 
					                pass
 | 
				
			||||||
 | 
					            elif key == "links":
 | 
				
			||||||
 | 
					                entities = doc_annot.get("entities", {})
 | 
				
			||||||
 | 
					                if not entities:
 | 
				
			||||||
 | 
					                    raise ValueError(Errors.E981)
 | 
				
			||||||
 | 
					                ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
 | 
				
			||||||
 | 
					                tok_annot["ENT_KB_ID"] = ent_kb_ids
 | 
				
			||||||
 | 
					            elif key == "cats":
 | 
				
			||||||
 | 
					                pass
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                raise ValueError(f"Unknown doc attribute: {key}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for key, value in tok_annot.items():
 | 
				
			||||||
 | 
					        if key not in IDS:
 | 
				
			||||||
 | 
					            raise ValueError(f"Unknown token attribute: {key}")
 | 
				
			||||||
 | 
					        elif key in ["ORTH", "SPACY"]:
 | 
				
			||||||
 | 
					            pass
 | 
				
			||||||
 | 
					        elif key == "HEAD":
 | 
				
			||||||
 | 
					            attrs.append(key)
 | 
				
			||||||
 | 
					            values.append([h-i for i, h in enumerate(value)])
 | 
				
			||||||
 | 
					        elif key == "SENT_START":
 | 
				
			||||||
 | 
					            attrs.append(key)
 | 
				
			||||||
 | 
					            values.append(value)
 | 
				
			||||||
 | 
					        elif key == "MORPH":
 | 
				
			||||||
 | 
					            attrs.append(key)
 | 
				
			||||||
 | 
					            values.append([vocab.morphology.add(v) for v in value])
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            attrs.append(key)
 | 
				
			||||||
 | 
					            values.append([vocab.strings.add(v) for v in value])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    array = numpy.asarray(values, dtype="uint64")
 | 
				
			||||||
 | 
					    return attrs, array.T
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _add_entities_to_doc(doc, ner_data):
 | 
				
			||||||
 | 
					    if ner_data is None:
 | 
				
			||||||
 | 
					        return
 | 
				
			||||||
 | 
					    elif ner_data == []:
 | 
				
			||||||
 | 
					        doc.ents = []
 | 
				
			||||||
 | 
					    elif isinstance(ner_data[0], tuple):
 | 
				
			||||||
 | 
					        return _add_entities_to_doc(
 | 
				
			||||||
 | 
					            doc,
 | 
				
			||||||
 | 
					            biluo_tags_from_offsets(doc, ner_data)
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    elif isinstance(ner_data[0], str) or ner_data[0] is None:
 | 
				
			||||||
 | 
					        return _add_entities_to_doc(
 | 
				
			||||||
 | 
					            doc,
 | 
				
			||||||
 | 
					            spans_from_biluo_tags(doc, ner_data)
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    elif isinstance(ner_data[0], Span):
 | 
				
			||||||
 | 
					        # Ugh, this is super messy. Really hard to set O entities
 | 
				
			||||||
 | 
					        doc.ents = ner_data
 | 
				
			||||||
 | 
					        doc.ents = [span for span in ner_data if span.label_]
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        raise ValueError("Unexpected type for NER data")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _parse_example_dict_data(example_dict):
 | 
				
			||||||
 | 
					    return (
 | 
				
			||||||
 | 
					        example_dict["token_annotation"],
 | 
				
			||||||
 | 
					        example_dict["doc_annotation"]
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _fix_legacy_dict_data(example_dict):
 | 
				
			||||||
 | 
					    token_dict = example_dict.get("token_annotation", {})
 | 
				
			||||||
 | 
					    doc_dict = example_dict.get("doc_annotation", {})
 | 
				
			||||||
 | 
					    for key, value in example_dict.items():
 | 
				
			||||||
 | 
					        if value:
 | 
				
			||||||
 | 
					            if key in ("token_annotation", "doc_annotation"):
 | 
				
			||||||
 | 
					                pass
 | 
				
			||||||
 | 
					            elif key == "ids":
 | 
				
			||||||
 | 
					                pass
 | 
				
			||||||
 | 
					            elif key in ("cats", "links"):
 | 
				
			||||||
 | 
					                doc_dict[key] = value
 | 
				
			||||||
 | 
					            elif key in ("ner", "entities"):
 | 
				
			||||||
 | 
					                doc_dict["entities"] = value
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                token_dict[key] = value
 | 
				
			||||||
 | 
					    # Remap keys
 | 
				
			||||||
 | 
					    remapping = {
 | 
				
			||||||
 | 
					        "words": "ORTH",
 | 
				
			||||||
 | 
					        "tags": "TAG",
 | 
				
			||||||
 | 
					        "pos": "POS",
 | 
				
			||||||
 | 
					        "lemmas": "LEMMA",
 | 
				
			||||||
 | 
					        "deps": "DEP",
 | 
				
			||||||
 | 
					        "heads": "HEAD",
 | 
				
			||||||
 | 
					        "sent_starts": "SENT_START",
 | 
				
			||||||
 | 
					        "morphs": "MORPH",
 | 
				
			||||||
 | 
					        "spaces": "SPACY",
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    old_token_dict = token_dict
 | 
				
			||||||
 | 
					    token_dict = {}
 | 
				
			||||||
 | 
					    for key, value in old_token_dict.items():
 | 
				
			||||||
 | 
					        if key in ("text", "ids", "brackets"):
 | 
				
			||||||
 | 
					            pass
 | 
				
			||||||
 | 
					        elif key in remapping:
 | 
				
			||||||
 | 
					            token_dict[remapping[key]] = value
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
 | 
				
			||||||
 | 
					    text = example_dict.get("text", example_dict.get("raw"))
 | 
				
			||||||
 | 
					    if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"):
 | 
				
			||||||
 | 
					        token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
 | 
				
			||||||
 | 
					    if "HEAD" in token_dict and "SENT_START" in token_dict:
 | 
				
			||||||
 | 
					        # If heads are set, we don't also redundantly specify SENT_START.
 | 
				
			||||||
 | 
					        token_dict.pop("SENT_START")
 | 
				
			||||||
 | 
					        warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set")
 | 
				
			||||||
 | 
					    return {
 | 
				
			||||||
 | 
					        "token_annotation": token_dict,
 | 
				
			||||||
 | 
					        "doc_annotation": doc_dict
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _has_field(annot, field):
 | 
				
			||||||
 | 
					    if field not in annot:
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					    elif annot[field] is None:
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					    elif len(annot[field]) == 0:
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					    elif all([value is None for value in annot[field]]):
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        return True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
 | 
				
			||||||
 | 
					    if isinstance(biluo_or_offsets[0], (list, tuple)):
 | 
				
			||||||
 | 
					        # Convert to biluo if necessary
 | 
				
			||||||
 | 
					        # This is annoying but to convert the offsets we need a Doc
 | 
				
			||||||
 | 
					        # that has the target tokenization.
 | 
				
			||||||
 | 
					        reference = Doc(vocab, words=words, spaces=spaces)
 | 
				
			||||||
 | 
					        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        biluo = biluo_or_offsets
 | 
				
			||||||
 | 
					    ent_iobs = []
 | 
				
			||||||
 | 
					    ent_types = []
 | 
				
			||||||
 | 
					    for iob_tag in biluo_to_iob(biluo):
 | 
				
			||||||
 | 
					        if iob_tag in (None, "-"):
 | 
				
			||||||
 | 
					            ent_iobs.append("")
 | 
				
			||||||
 | 
					            ent_types.append("")
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            ent_iobs.append(iob_tag.split("-")[0])
 | 
				
			||||||
 | 
					            if iob_tag.startswith("I") or iob_tag.startswith("B"):
 | 
				
			||||||
 | 
					                ent_types.append(iob_tag.split("-", 1)[1])
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                ent_types.append("")
 | 
				
			||||||
 | 
					    return ent_iobs, ent_types
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _parse_links(vocab, words, links, entities):
 | 
				
			||||||
 | 
					    reference = Doc(vocab, words=words)
 | 
				
			||||||
 | 
					    starts = {token.idx: token.i for token in reference}
 | 
				
			||||||
 | 
					    ends = {token.idx + len(token): token.i for token in reference}
 | 
				
			||||||
 | 
					    ent_kb_ids = ["" for _ in reference]
 | 
				
			||||||
 | 
					    entity_map = [(ent[0], ent[1]) for ent in entities]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # links annotations need to refer 1-1 to entity annotations - throw error otherwise
 | 
				
			||||||
 | 
					    for index, annot_dict in links.items():
 | 
				
			||||||
 | 
					        start_char, end_char = index
 | 
				
			||||||
 | 
					        if (start_char, end_char) not in entity_map:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E981)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for index, annot_dict in links.items():
 | 
				
			||||||
 | 
					        true_kb_ids = []
 | 
				
			||||||
 | 
					        for key, value in annot_dict.items():
 | 
				
			||||||
 | 
					            if value == 1.0:
 | 
				
			||||||
 | 
					                true_kb_ids.append(key)
 | 
				
			||||||
 | 
					        if len(true_kb_ids) > 1:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E980)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if len(true_kb_ids) == 1:
 | 
				
			||||||
 | 
					            start_char, end_char = index
 | 
				
			||||||
 | 
					            start_token = starts.get(start_char)
 | 
				
			||||||
 | 
					            end_token = ends.get(end_char)
 | 
				
			||||||
 | 
					            for i in range(start_token, end_token+1):
 | 
				
			||||||
 | 
					                ent_kb_ids[i] = true_kb_ids[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return ent_kb_ids
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _guess_spaces(text, words):
 | 
				
			||||||
 | 
					    if text is None:
 | 
				
			||||||
 | 
					        return [True] * len(words)
 | 
				
			||||||
 | 
					    spaces = []
 | 
				
			||||||
 | 
					    text_pos = 0
 | 
				
			||||||
 | 
					    # align words with text
 | 
				
			||||||
 | 
					    for word in words:
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            word_start = text[text_pos:].index(word)
 | 
				
			||||||
 | 
					        except ValueError:
 | 
				
			||||||
 | 
					            spaces.append(True)
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        text_pos += word_start + len(word)
 | 
				
			||||||
 | 
					        if text_pos < len(text) and text[text_pos] == " ":
 | 
				
			||||||
 | 
					            spaces.append(True)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            spaces.append(False)
 | 
				
			||||||
 | 
					    return spaces
 | 
				
			||||||
							
								
								
									
										199
									
								
								spacy/gold/gold_io.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										199
									
								
								spacy/gold/gold_io.pyx
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,199 @@
 | 
				
			||||||
 | 
					import warnings
 | 
				
			||||||
 | 
					import srsly
 | 
				
			||||||
 | 
					from .. import util
 | 
				
			||||||
 | 
					from ..errors import Warnings
 | 
				
			||||||
 | 
					from ..tokens import Doc
 | 
				
			||||||
 | 
					from .iob_utils import biluo_tags_from_offsets, tags_to_entities
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
 | 
				
			||||||
 | 
					    """Convert a list of Doc objects into the JSON-serializable format used by
 | 
				
			||||||
 | 
					    the spacy train command.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    docs (iterable / Doc): The Doc object(s) to convert.
 | 
				
			||||||
 | 
					    doc_id (int): Id for the JSON.
 | 
				
			||||||
 | 
					    RETURNS (dict): The data in spaCy's JSON format
 | 
				
			||||||
 | 
					        - each input doc will be treated as a paragraph in the output doc
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    if isinstance(docs, Doc):
 | 
				
			||||||
 | 
					        docs = [docs]
 | 
				
			||||||
 | 
					    json_doc = {"id": doc_id, "paragraphs": []}
 | 
				
			||||||
 | 
					    for i, doc in enumerate(docs):
 | 
				
			||||||
 | 
					        json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
 | 
				
			||||||
 | 
					        for cat, val in doc.cats.items():
 | 
				
			||||||
 | 
					            json_cat = {"label": cat, "value": val}
 | 
				
			||||||
 | 
					            json_para["cats"].append(json_cat)
 | 
				
			||||||
 | 
					        for ent in doc.ents:
 | 
				
			||||||
 | 
					            ent_tuple = (ent.start_char, ent.end_char, ent.label_)
 | 
				
			||||||
 | 
					            json_para["entities"].append(ent_tuple)
 | 
				
			||||||
 | 
					            if ent.kb_id_:
 | 
				
			||||||
 | 
					                link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
 | 
				
			||||||
 | 
					                json_para["links"].append(link_dict)
 | 
				
			||||||
 | 
					        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
 | 
				
			||||||
 | 
					        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
 | 
				
			||||||
 | 
					        for j, sent in enumerate(doc.sents):
 | 
				
			||||||
 | 
					            json_sent = {"tokens": [], "brackets": []}
 | 
				
			||||||
 | 
					            for token in sent:
 | 
				
			||||||
 | 
					                json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
 | 
				
			||||||
 | 
					                if doc.is_tagged:
 | 
				
			||||||
 | 
					                    json_token["tag"] = token.tag_
 | 
				
			||||||
 | 
					                    json_token["pos"] = token.pos_
 | 
				
			||||||
 | 
					                    json_token["morph"] = token.morph_
 | 
				
			||||||
 | 
					                    json_token["lemma"] = token.lemma_
 | 
				
			||||||
 | 
					                if doc.is_parsed:
 | 
				
			||||||
 | 
					                    json_token["head"] = token.head.i-token.i
 | 
				
			||||||
 | 
					                    json_token["dep"] = token.dep_
 | 
				
			||||||
 | 
					                json_sent["tokens"].append(json_token)
 | 
				
			||||||
 | 
					            json_para["sentences"].append(json_sent)
 | 
				
			||||||
 | 
					        json_doc["paragraphs"].append(json_para)
 | 
				
			||||||
 | 
					    return json_doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def read_json_file(loc, docs_filter=None, limit=None):
 | 
				
			||||||
 | 
					    """Read Example dictionaries from a json file or directory."""
 | 
				
			||||||
 | 
					    loc = util.ensure_path(loc)
 | 
				
			||||||
 | 
					    if loc.is_dir():
 | 
				
			||||||
 | 
					        for filename in loc.iterdir():
 | 
				
			||||||
 | 
					            yield from read_json_file(loc / filename, limit=limit)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        with loc.open("rb") as file_:
 | 
				
			||||||
 | 
					            utf8_str = file_.read()
 | 
				
			||||||
 | 
					        for json_doc in json_iterate(utf8_str):
 | 
				
			||||||
 | 
					            if docs_filter is not None and not docs_filter(json_doc):
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            for json_paragraph in json_to_annotations(json_doc):
 | 
				
			||||||
 | 
					                yield json_paragraph
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def json_to_annotations(doc):
 | 
				
			||||||
 | 
					    """Convert an item in the JSON-formatted training data to the format
 | 
				
			||||||
 | 
					    used by Example.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    doc (dict): One entry in the training data.
 | 
				
			||||||
 | 
					    YIELDS (tuple): The reformatted data - one training example per paragraph
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    for paragraph in doc["paragraphs"]:
 | 
				
			||||||
 | 
					        example = {"text": paragraph.get("raw", None)}
 | 
				
			||||||
 | 
					        words = []
 | 
				
			||||||
 | 
					        spaces = []
 | 
				
			||||||
 | 
					        ids = []
 | 
				
			||||||
 | 
					        tags = []
 | 
				
			||||||
 | 
					        ner_tags = []
 | 
				
			||||||
 | 
					        pos = []
 | 
				
			||||||
 | 
					        morphs = []
 | 
				
			||||||
 | 
					        lemmas = []
 | 
				
			||||||
 | 
					        heads = []
 | 
				
			||||||
 | 
					        labels = []
 | 
				
			||||||
 | 
					        sent_starts = []
 | 
				
			||||||
 | 
					        brackets = []
 | 
				
			||||||
 | 
					        for sent in paragraph["sentences"]:
 | 
				
			||||||
 | 
					            sent_start_i = len(words)
 | 
				
			||||||
 | 
					            for i, token in enumerate(sent["tokens"]):
 | 
				
			||||||
 | 
					                words.append(token["orth"])
 | 
				
			||||||
 | 
					                spaces.append(token.get("space", None))
 | 
				
			||||||
 | 
					                ids.append(token.get('id', sent_start_i + i))
 | 
				
			||||||
 | 
					                tags.append(token.get("tag", None))
 | 
				
			||||||
 | 
					                pos.append(token.get("pos", None))
 | 
				
			||||||
 | 
					                morphs.append(token.get("morph", None))
 | 
				
			||||||
 | 
					                lemmas.append(token.get("lemma", None))
 | 
				
			||||||
 | 
					                if "head" in token:
 | 
				
			||||||
 | 
					                    heads.append(token["head"] + sent_start_i + i)
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    heads.append(None)
 | 
				
			||||||
 | 
					                if "dep" in token:
 | 
				
			||||||
 | 
					                    labels.append(token["dep"])
 | 
				
			||||||
 | 
					                    # Ensure ROOT label is case-insensitive
 | 
				
			||||||
 | 
					                    if labels[-1].lower() == "root":
 | 
				
			||||||
 | 
					                        labels[-1] = "ROOT"
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    labels.append(None)
 | 
				
			||||||
 | 
					                ner_tags.append(token.get("ner", None))
 | 
				
			||||||
 | 
					                if i == 0:
 | 
				
			||||||
 | 
					                    sent_starts.append(1)
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    sent_starts.append(0)
 | 
				
			||||||
 | 
					            if "brackets" in sent:
 | 
				
			||||||
 | 
					                brackets.extend((b["first"] + sent_start_i,
 | 
				
			||||||
 | 
					                                 b["last"] + sent_start_i, b["label"])
 | 
				
			||||||
 | 
					                                 for b in sent["brackets"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        example["token_annotation"] = dict(
 | 
				
			||||||
 | 
					            ids=ids,
 | 
				
			||||||
 | 
					            words=words,
 | 
				
			||||||
 | 
					            spaces=spaces,
 | 
				
			||||||
 | 
					            sent_starts=sent_starts,
 | 
				
			||||||
 | 
					            brackets=brackets
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        # avoid including dummy values that looks like gold info was present
 | 
				
			||||||
 | 
					        if any(tags):
 | 
				
			||||||
 | 
					            example["token_annotation"]["tags"] = tags
 | 
				
			||||||
 | 
					        if any(pos):
 | 
				
			||||||
 | 
					            example["token_annotation"]["pos"] = pos
 | 
				
			||||||
 | 
					        if any(morphs):
 | 
				
			||||||
 | 
					            example["token_annotation"]["morphs"] = morphs
 | 
				
			||||||
 | 
					        if any(lemmas):
 | 
				
			||||||
 | 
					            example["token_annotation"]["lemmas"] = lemmas
 | 
				
			||||||
 | 
					        if any(head is not None for head in heads):
 | 
				
			||||||
 | 
					            example["token_annotation"]["heads"] = heads
 | 
				
			||||||
 | 
					        if any(labels):
 | 
				
			||||||
 | 
					            example["token_annotation"]["deps"] = labels
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        cats = {}
 | 
				
			||||||
 | 
					        for cat in paragraph.get("cats", {}):
 | 
				
			||||||
 | 
					            cats[cat["label"]] = cat["value"]
 | 
				
			||||||
 | 
					        example["doc_annotation"] = dict(
 | 
				
			||||||
 | 
					            cats=cats,
 | 
				
			||||||
 | 
					            entities=ner_tags,
 | 
				
			||||||
 | 
					            links=paragraph.get("links", [])   # TODO: fix/test
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        yield example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def json_iterate(bytes utf8_str):
 | 
				
			||||||
 | 
					    # We should've made these files jsonl...But since we didn't, parse out
 | 
				
			||||||
 | 
					    # the docs one-by-one to reduce memory usage.
 | 
				
			||||||
 | 
					    # It's okay to read in the whole file -- just don't parse it into JSON.
 | 
				
			||||||
 | 
					    cdef long file_length = len(utf8_str)
 | 
				
			||||||
 | 
					    if file_length > 2 ** 30:
 | 
				
			||||||
 | 
					        warnings.warn(Warnings.W027.format(size=file_length))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    raw = <char*>utf8_str
 | 
				
			||||||
 | 
					    cdef int square_depth = 0
 | 
				
			||||||
 | 
					    cdef int curly_depth = 0
 | 
				
			||||||
 | 
					    cdef int inside_string = 0
 | 
				
			||||||
 | 
					    cdef int escape = 0
 | 
				
			||||||
 | 
					    cdef long start = -1
 | 
				
			||||||
 | 
					    cdef char c
 | 
				
			||||||
 | 
					    cdef char quote = ord('"')
 | 
				
			||||||
 | 
					    cdef char backslash = ord("\\")
 | 
				
			||||||
 | 
					    cdef char open_square = ord("[")
 | 
				
			||||||
 | 
					    cdef char close_square = ord("]")
 | 
				
			||||||
 | 
					    cdef char open_curly = ord("{")
 | 
				
			||||||
 | 
					    cdef char close_curly = ord("}")
 | 
				
			||||||
 | 
					    for i in range(file_length):
 | 
				
			||||||
 | 
					        c = raw[i]
 | 
				
			||||||
 | 
					        if escape:
 | 
				
			||||||
 | 
					            escape = False
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        if c == backslash:
 | 
				
			||||||
 | 
					            escape = True
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        if c == quote:
 | 
				
			||||||
 | 
					            inside_string = not inside_string
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        if inside_string:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        if c == open_square:
 | 
				
			||||||
 | 
					            square_depth += 1
 | 
				
			||||||
 | 
					        elif c == close_square:
 | 
				
			||||||
 | 
					            square_depth -= 1
 | 
				
			||||||
 | 
					        elif c == open_curly:
 | 
				
			||||||
 | 
					            if square_depth == 1 and curly_depth == 0:
 | 
				
			||||||
 | 
					                start = i
 | 
				
			||||||
 | 
					            curly_depth += 1
 | 
				
			||||||
 | 
					        elif c == close_curly:
 | 
				
			||||||
 | 
					            curly_depth -= 1
 | 
				
			||||||
 | 
					            if square_depth == 1 and curly_depth == 0:
 | 
				
			||||||
 | 
					                substr = utf8_str[start : i + 1].decode("utf8")
 | 
				
			||||||
 | 
					                yield srsly.json_loads(substr)
 | 
				
			||||||
 | 
					                start = -1
 | 
				
			||||||
							
								
								
									
										209
									
								
								spacy/gold/iob_utils.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										209
									
								
								spacy/gold/iob_utils.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,209 @@
 | 
				
			||||||
 | 
					import warnings
 | 
				
			||||||
 | 
					from ..errors import Errors, Warnings
 | 
				
			||||||
 | 
					from ..tokens import Span
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def iob_to_biluo(tags):
 | 
				
			||||||
 | 
					    out = []
 | 
				
			||||||
 | 
					    tags = list(tags)
 | 
				
			||||||
 | 
					    while tags:
 | 
				
			||||||
 | 
					        out.extend(_consume_os(tags))
 | 
				
			||||||
 | 
					        out.extend(_consume_ent(tags))
 | 
				
			||||||
 | 
					    return out
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def biluo_to_iob(tags):
 | 
				
			||||||
 | 
					    out = []
 | 
				
			||||||
 | 
					    for tag in tags:
 | 
				
			||||||
 | 
					        if tag is None:
 | 
				
			||||||
 | 
					            out.append(tag)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1)
 | 
				
			||||||
 | 
					            out.append(tag)
 | 
				
			||||||
 | 
					    return out
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _consume_os(tags):
 | 
				
			||||||
 | 
					    while tags and tags[0] == "O":
 | 
				
			||||||
 | 
					        yield tags.pop(0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _consume_ent(tags):
 | 
				
			||||||
 | 
					    if not tags:
 | 
				
			||||||
 | 
					        return []
 | 
				
			||||||
 | 
					    tag = tags.pop(0)
 | 
				
			||||||
 | 
					    target_in = "I" + tag[1:]
 | 
				
			||||||
 | 
					    target_last = "L" + tag[1:]
 | 
				
			||||||
 | 
					    length = 1
 | 
				
			||||||
 | 
					    while tags and tags[0] in {target_in, target_last}:
 | 
				
			||||||
 | 
					        length += 1
 | 
				
			||||||
 | 
					        tags.pop(0)
 | 
				
			||||||
 | 
					    label = tag[2:]
 | 
				
			||||||
 | 
					    if length == 1:
 | 
				
			||||||
 | 
					        if len(label) == 0:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E177.format(tag=tag))
 | 
				
			||||||
 | 
					        return ["U-" + label]
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        start = "B-" + label
 | 
				
			||||||
 | 
					        end = "L-" + label
 | 
				
			||||||
 | 
					        middle = [f"I-{label}" for _ in range(1, length - 1)]
 | 
				
			||||||
 | 
					        return [start] + middle + [end]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def biluo_tags_from_doc(doc, missing="O"):
 | 
				
			||||||
 | 
					    return biluo_tags_from_offsets(
 | 
				
			||||||
 | 
					        doc,
 | 
				
			||||||
 | 
					        [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
 | 
				
			||||||
 | 
					        missing=missing,
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def biluo_tags_from_offsets(doc, entities, missing="O"):
 | 
				
			||||||
 | 
					    """Encode labelled spans into per-token tags, using the
 | 
				
			||||||
 | 
					    Begin/In/Last/Unit/Out scheme (BILUO).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    doc (Doc): The document that the entity offsets refer to. The output tags
 | 
				
			||||||
 | 
					        will refer to the token boundaries within the document.
 | 
				
			||||||
 | 
					    entities (iterable): A sequence of `(start, end, label)` triples. `start`
 | 
				
			||||||
 | 
					        and `end` should be character-offset integers denoting the slice into
 | 
				
			||||||
 | 
					        the original string.
 | 
				
			||||||
 | 
					    RETURNS (list): A list of unicode strings, describing the tags. Each tag
 | 
				
			||||||
 | 
					        string will be of the form either "", "O" or "{action}-{label}", where
 | 
				
			||||||
 | 
					        action is one of "B", "I", "L", "U". The string "-" is used where the
 | 
				
			||||||
 | 
					        entity offsets don't align with the tokenization in the `Doc` object.
 | 
				
			||||||
 | 
					        The training algorithm will view these as missing values. "O" denotes a
 | 
				
			||||||
 | 
					        non-entity token. "B" denotes the beginning of a multi-token entity,
 | 
				
			||||||
 | 
					        "I" the inside of an entity of three or more tokens, and "L" the end
 | 
				
			||||||
 | 
					        of an entity of two or more tokens. "U" denotes a single-token entity.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    EXAMPLE:
 | 
				
			||||||
 | 
					        >>> text = 'I like London.'
 | 
				
			||||||
 | 
					        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
 | 
				
			||||||
 | 
					        >>> doc = nlp.tokenizer(text)
 | 
				
			||||||
 | 
					        >>> tags = biluo_tags_from_offsets(doc, entities)
 | 
				
			||||||
 | 
					        >>> assert tags == ["O", "O", 'U-LOC', "O"]
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    # Ensure no overlapping entity labels exist
 | 
				
			||||||
 | 
					    tokens_in_ents = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    starts = {token.idx: token.i for token in doc}
 | 
				
			||||||
 | 
					    ends = {token.idx + len(token): token.i for token in doc}
 | 
				
			||||||
 | 
					    biluo = ["-" for _ in doc]
 | 
				
			||||||
 | 
					    # Handle entity cases
 | 
				
			||||||
 | 
					    for start_char, end_char, label in entities:
 | 
				
			||||||
 | 
					        if not label:
 | 
				
			||||||
 | 
					            for s in starts:   # account for many-to-one
 | 
				
			||||||
 | 
					                if s >= start_char and s < end_char:
 | 
				
			||||||
 | 
					                    biluo[starts[s]] = "O"
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            for token_index in range(start_char, end_char):
 | 
				
			||||||
 | 
					                if token_index in tokens_in_ents.keys():
 | 
				
			||||||
 | 
					                    raise ValueError(
 | 
				
			||||||
 | 
					                        Errors.E103.format(
 | 
				
			||||||
 | 
					                            span1=(
 | 
				
			||||||
 | 
					                                tokens_in_ents[token_index][0],
 | 
				
			||||||
 | 
					                                tokens_in_ents[token_index][1],
 | 
				
			||||||
 | 
					                                tokens_in_ents[token_index][2],
 | 
				
			||||||
 | 
					                            ),
 | 
				
			||||||
 | 
					                            span2=(start_char, end_char, label),
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
 | 
					                tokens_in_ents[token_index] = (start_char, end_char, label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            start_token = starts.get(start_char)
 | 
				
			||||||
 | 
					            end_token = ends.get(end_char)
 | 
				
			||||||
 | 
					            # Only interested if the tokenization is correct
 | 
				
			||||||
 | 
					            if start_token is not None and end_token is not None:
 | 
				
			||||||
 | 
					                if start_token == end_token:
 | 
				
			||||||
 | 
					                    biluo[start_token] = f"U-{label}"
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    biluo[start_token] = f"B-{label}"
 | 
				
			||||||
 | 
					                    for i in range(start_token + 1, end_token):
 | 
				
			||||||
 | 
					                        biluo[i] = f"I-{label}"
 | 
				
			||||||
 | 
					                    biluo[end_token] = f"L-{label}"
 | 
				
			||||||
 | 
					    # Now distinguish the O cases from ones where we miss the tokenization
 | 
				
			||||||
 | 
					    entity_chars = set()
 | 
				
			||||||
 | 
					    for start_char, end_char, label in entities:
 | 
				
			||||||
 | 
					        for i in range(start_char, end_char):
 | 
				
			||||||
 | 
					            entity_chars.add(i)
 | 
				
			||||||
 | 
					    for token in doc:
 | 
				
			||||||
 | 
					        for i in range(token.idx, token.idx + len(token)):
 | 
				
			||||||
 | 
					            if i in entity_chars:
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            biluo[token.i] = missing
 | 
				
			||||||
 | 
					    if "-" in biluo and missing != "-":
 | 
				
			||||||
 | 
					        ent_str = str(entities)
 | 
				
			||||||
 | 
					        warnings.warn(
 | 
				
			||||||
 | 
					            Warnings.W030.format(
 | 
				
			||||||
 | 
					                text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
 | 
				
			||||||
 | 
					                entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    return biluo
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def spans_from_biluo_tags(doc, tags):
 | 
				
			||||||
 | 
					    """Encode per-token tags following the BILUO scheme into Span object, e.g.
 | 
				
			||||||
 | 
					    to overwrite the doc.ents.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    doc (Doc): The document that the BILUO tags refer to.
 | 
				
			||||||
 | 
					    entities (iterable): A sequence of BILUO tags with each tag describing one
 | 
				
			||||||
 | 
					        token. Each tags string will be of the form of either "", "O" or
 | 
				
			||||||
 | 
					        "{action}-{label}", where action is one of "B", "I", "L", "U".
 | 
				
			||||||
 | 
					    RETURNS (list): A sequence of Span objects.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    token_offsets = tags_to_entities(tags)
 | 
				
			||||||
 | 
					    spans = []
 | 
				
			||||||
 | 
					    for label, start_idx, end_idx in token_offsets:
 | 
				
			||||||
 | 
					        span = Span(doc, start_idx, end_idx + 1, label=label)
 | 
				
			||||||
 | 
					        spans.append(span)
 | 
				
			||||||
 | 
					    return spans
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def offsets_from_biluo_tags(doc, tags):
 | 
				
			||||||
 | 
					    """Encode per-token tags following the BILUO scheme into entity offsets.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    doc (Doc): The document that the BILUO tags refer to.
 | 
				
			||||||
 | 
					    entities (iterable): A sequence of BILUO tags with each tag describing one
 | 
				
			||||||
 | 
					        token. Each tags string will be of the form of either "", "O" or
 | 
				
			||||||
 | 
					        "{action}-{label}", where action is one of "B", "I", "L", "U".
 | 
				
			||||||
 | 
					    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
 | 
				
			||||||
 | 
					        `end` will be character-offset integers denoting the slice into the
 | 
				
			||||||
 | 
					        original string.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    spans = spans_from_biluo_tags(doc, tags)
 | 
				
			||||||
 | 
					    return [(span.start_char, span.end_char, span.label_) for span in spans]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def tags_to_entities(tags):
 | 
				
			||||||
 | 
					    """ Note that the end index returned by this function is inclusive.
 | 
				
			||||||
 | 
					    To use it for Span creation, increment the end by 1."""
 | 
				
			||||||
 | 
					    entities = []
 | 
				
			||||||
 | 
					    start = None
 | 
				
			||||||
 | 
					    for i, tag in enumerate(tags):
 | 
				
			||||||
 | 
					        if tag is None:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        if tag.startswith("O"):
 | 
				
			||||||
 | 
					            # TODO: We shouldn't be getting these malformed inputs. Fix this.
 | 
				
			||||||
 | 
					            if start is not None:
 | 
				
			||||||
 | 
					                start = None
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                entities.append(("", i, i))
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        elif tag == "-":
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        elif tag.startswith("I"):
 | 
				
			||||||
 | 
					            if start is None:
 | 
				
			||||||
 | 
					                raise ValueError(Errors.E067.format(tags=tags[: i + 1]))
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        if tag.startswith("U"):
 | 
				
			||||||
 | 
					            entities.append((tag[2:], i, i))
 | 
				
			||||||
 | 
					        elif tag.startswith("B"):
 | 
				
			||||||
 | 
					            start = i
 | 
				
			||||||
 | 
					        elif tag.startswith("L"):
 | 
				
			||||||
 | 
					            entities.append((tag[2:], start, i))
 | 
				
			||||||
 | 
					            start = None
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E068.format(tag=tag))
 | 
				
			||||||
 | 
					    return entities
 | 
				
			||||||
| 
						 | 
					@ -529,6 +529,22 @@ class Language(object):
 | 
				
			||||||
    def make_doc(self, text):
 | 
					    def make_doc(self, text):
 | 
				
			||||||
        return self.tokenizer(text)
 | 
					        return self.tokenizer(text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _convert_examples(self, examples):
 | 
				
			||||||
 | 
					        converted_examples = []
 | 
				
			||||||
 | 
					        if isinstance(examples, tuple):
 | 
				
			||||||
 | 
					            examples = [examples]
 | 
				
			||||||
 | 
					        for eg in examples:
 | 
				
			||||||
 | 
					            if isinstance(eg, Example):
 | 
				
			||||||
 | 
					                converted_examples.append(eg.copy())
 | 
				
			||||||
 | 
					            elif isinstance(eg, tuple):
 | 
				
			||||||
 | 
					                doc, annot = eg
 | 
				
			||||||
 | 
					                if isinstance(doc, str):
 | 
				
			||||||
 | 
					                    doc = self.make_doc(doc)
 | 
				
			||||||
 | 
					                converted_examples.append(Example.from_dict(doc, annot))
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                raise ValueError(Errors.E979.format(type=type(eg)))
 | 
				
			||||||
 | 
					        return converted_examples
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def update(
 | 
					    def update(
 | 
				
			||||||
        self,
 | 
					        self,
 | 
				
			||||||
        examples,
 | 
					        examples,
 | 
				
			||||||
| 
						 | 
					@ -556,7 +572,7 @@ class Language(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if len(examples) == 0:
 | 
					        if len(examples) == 0:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
 | 
					        examples = self._convert_examples(examples)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if sgd is None:
 | 
					        if sgd is None:
 | 
				
			||||||
            if self._optimizer is None:
 | 
					            if self._optimizer is None:
 | 
				
			||||||
| 
						 | 
					@ -604,7 +620,7 @@ class Language(object):
 | 
				
			||||||
        # TODO: document
 | 
					        # TODO: document
 | 
				
			||||||
        if len(examples) == 0:
 | 
					        if len(examples) == 0:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
 | 
					        examples = self._convert_examples(examples)
 | 
				
			||||||
        if sgd is None:
 | 
					        if sgd is None:
 | 
				
			||||||
            if self._optimizer is None:
 | 
					            if self._optimizer is None:
 | 
				
			||||||
                self._optimizer = create_default_optimizer()
 | 
					                self._optimizer = create_default_optimizer()
 | 
				
			||||||
| 
						 | 
					@ -632,19 +648,6 @@ class Language(object):
 | 
				
			||||||
            sgd(W, dW, key=key)
 | 
					            sgd(W, dW, key=key)
 | 
				
			||||||
        return losses
 | 
					        return losses
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def preprocess_gold(self, examples):
 | 
					 | 
				
			||||||
        """Can be called before training to pre-process gold data. By default,
 | 
					 | 
				
			||||||
        it handles nonprojectivity and adds missing tags to the tag map.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        examples (iterable): `Example` objects.
 | 
					 | 
				
			||||||
        YIELDS (tuple): `Example` objects.
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        for name, proc in self.pipeline:
 | 
					 | 
				
			||||||
            if hasattr(proc, "preprocess_gold"):
 | 
					 | 
				
			||||||
                examples = proc.preprocess_gold(examples)
 | 
					 | 
				
			||||||
        for ex in examples:
 | 
					 | 
				
			||||||
            yield ex
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
 | 
					    def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
 | 
				
			||||||
        """Allocate models, pre-process training data and acquire a trainer and
 | 
					        """Allocate models, pre-process training data and acquire a trainer and
 | 
				
			||||||
        optimizer. Used as a contextmanager.
 | 
					        optimizer. Used as a contextmanager.
 | 
				
			||||||
| 
						 | 
					@ -662,7 +665,7 @@ class Language(object):
 | 
				
			||||||
        # Populate vocab
 | 
					        # Populate vocab
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            for example in get_examples():
 | 
					            for example in get_examples():
 | 
				
			||||||
                for word in example.token_annotation.words:
 | 
					                for word in [t.text for t in example.reference]:
 | 
				
			||||||
                    _ = self.vocab[word]  # noqa: F841
 | 
					                    _ = self.vocab[word]  # noqa: F841
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if cfg.get("device", -1) >= 0:
 | 
					        if cfg.get("device", -1) >= 0:
 | 
				
			||||||
| 
						 | 
					@ -725,24 +728,26 @@ class Language(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/language#evaluate
 | 
					        DOCS: https://spacy.io/api/language#evaluate
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
 | 
					        examples = self._convert_examples(examples)
 | 
				
			||||||
        if scorer is None:
 | 
					        if scorer is None:
 | 
				
			||||||
            scorer = Scorer(pipeline=self.pipeline)
 | 
					            scorer = Scorer(pipeline=self.pipeline)
 | 
				
			||||||
        if component_cfg is None:
 | 
					        if component_cfg is None:
 | 
				
			||||||
            component_cfg = {}
 | 
					            component_cfg = {}
 | 
				
			||||||
 | 
					        docs = list(eg.predicted for eg in examples)
 | 
				
			||||||
        for name, pipe in self.pipeline:
 | 
					        for name, pipe in self.pipeline:
 | 
				
			||||||
            kwargs = component_cfg.get(name, {})
 | 
					            kwargs = component_cfg.get(name, {})
 | 
				
			||||||
            kwargs.setdefault("batch_size", batch_size)
 | 
					            kwargs.setdefault("batch_size", batch_size)
 | 
				
			||||||
            if not hasattr(pipe, "pipe"):
 | 
					            if not hasattr(pipe, "pipe"):
 | 
				
			||||||
                examples = _pipe(examples, pipe, kwargs)
 | 
					                docs = _pipe(docs, pipe, kwargs)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                examples = pipe.pipe(examples, as_example=True, **kwargs)
 | 
					                docs = pipe.pipe(docs, **kwargs)
 | 
				
			||||||
        for ex in examples:
 | 
					        for i, (doc, eg) in enumerate(zip(docs, examples)):
 | 
				
			||||||
            if verbose:
 | 
					            if verbose:
 | 
				
			||||||
                print(ex.doc)
 | 
					                print(doc)
 | 
				
			||||||
 | 
					            eg.predicted = doc
 | 
				
			||||||
            kwargs = component_cfg.get("scorer", {})
 | 
					            kwargs = component_cfg.get("scorer", {})
 | 
				
			||||||
            kwargs.setdefault("verbose", verbose)
 | 
					            kwargs.setdefault("verbose", verbose)
 | 
				
			||||||
            scorer.score(ex, **kwargs)
 | 
					            scorer.score(eg, **kwargs)
 | 
				
			||||||
        return scorer
 | 
					        return scorer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @contextmanager
 | 
					    @contextmanager
 | 
				
			||||||
| 
						 | 
					@ -787,7 +792,6 @@ class Language(object):
 | 
				
			||||||
        cleanup=False,
 | 
					        cleanup=False,
 | 
				
			||||||
        component_cfg=None,
 | 
					        component_cfg=None,
 | 
				
			||||||
        n_process=1,
 | 
					        n_process=1,
 | 
				
			||||||
        as_example=False,
 | 
					 | 
				
			||||||
    ):
 | 
					    ):
 | 
				
			||||||
        """Process texts as a stream, and yield `Doc` objects in order.
 | 
					        """Process texts as a stream, and yield `Doc` objects in order.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -821,7 +825,6 @@ class Language(object):
 | 
				
			||||||
                disable=disable,
 | 
					                disable=disable,
 | 
				
			||||||
                n_process=n_process,
 | 
					                n_process=n_process,
 | 
				
			||||||
                component_cfg=component_cfg,
 | 
					                component_cfg=component_cfg,
 | 
				
			||||||
                as_example=as_example,
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            for doc, context in zip(docs, contexts):
 | 
					            for doc, context in zip(docs, contexts):
 | 
				
			||||||
                yield (doc, context)
 | 
					                yield (doc, context)
 | 
				
			||||||
| 
						 | 
					@ -1210,9 +1213,9 @@ def _pipe(examples, proc, kwargs):
 | 
				
			||||||
    for arg in ["n_threads", "batch_size"]:
 | 
					    for arg in ["n_threads", "batch_size"]:
 | 
				
			||||||
        if arg in kwargs:
 | 
					        if arg in kwargs:
 | 
				
			||||||
            kwargs.pop(arg)
 | 
					            kwargs.pop(arg)
 | 
				
			||||||
    for ex in examples:
 | 
					    for eg in examples:
 | 
				
			||||||
        ex = proc(ex, **kwargs)
 | 
					        eg = proc(eg, **kwargs)
 | 
				
			||||||
        yield ex
 | 
					        yield eg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
 | 
					def _apply_pipes(make_doc, pipes, receiver, sender, underscore_state):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -80,13 +80,12 @@ def _get_transition_table(
 | 
				
			||||||
    B_start, B_end = (0, n_labels)
 | 
					    B_start, B_end = (0, n_labels)
 | 
				
			||||||
    I_start, I_end = (B_end, B_end + n_labels)
 | 
					    I_start, I_end = (B_end, B_end + n_labels)
 | 
				
			||||||
    L_start, L_end = (I_end, I_end + n_labels)
 | 
					    L_start, L_end = (I_end, I_end + n_labels)
 | 
				
			||||||
    U_start, U_end = (L_end, L_end + n_labels)
 | 
					    U_start, _ = (L_end, L_end + n_labels)
 | 
				
			||||||
    # Using ranges allows us to set specific cells, which is necessary to express
 | 
					    # Using ranges allows us to set specific cells, which is necessary to express
 | 
				
			||||||
    # that only actions of the same label are valid continuations.
 | 
					    # that only actions of the same label are valid continuations.
 | 
				
			||||||
    B_range = numpy.arange(B_start, B_end)
 | 
					    B_range = numpy.arange(B_start, B_end)
 | 
				
			||||||
    I_range = numpy.arange(I_start, I_end)
 | 
					    I_range = numpy.arange(I_start, I_end)
 | 
				
			||||||
    L_range = numpy.arange(L_start, L_end)
 | 
					    L_range = numpy.arange(L_start, L_end)
 | 
				
			||||||
    O_action = U_end
 | 
					 | 
				
			||||||
    # If this is the last token and the previous action was B or I, only L
 | 
					    # If this is the last token and the previous action was B or I, only L
 | 
				
			||||||
    # of that label is valid
 | 
					    # of that label is valid
 | 
				
			||||||
    table[1, B_range, L_range] = 1
 | 
					    table[1, B_range, L_range] = 1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -48,8 +48,7 @@ def forward(model, X, is_train):
 | 
				
			||||||
        model.inc_grad("b", dY.sum(axis=0))
 | 
					        model.inc_grad("b", dY.sum(axis=0))
 | 
				
			||||||
        dY = dY.reshape((dY.shape[0], nO * nP))
 | 
					        dY = dY.reshape((dY.shape[0], nO * nP))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Wopfi = W.transpose((1, 2, 0, 3))
 | 
					        Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
 | 
				
			||||||
        Wopfi = model.ops.xp.ascontiguousarray(Wopfi)
 | 
					 | 
				
			||||||
        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
 | 
					        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
 | 
				
			||||||
        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
 | 
					        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -59,7 +58,8 @@ def forward(model, X, is_train):
 | 
				
			||||||
        model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
 | 
					        model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
 | 
				
			||||||
        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
 | 
					        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
 | 
				
			||||||
        # (o, p, f, i) --> (f, o, p, i)
 | 
					        # (o, p, f, i) --> (f, o, p, i)
 | 
				
			||||||
        model.inc_grad("W", dWopfi.transpose((2, 0, 1, 3)))
 | 
					        dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
 | 
				
			||||||
 | 
					        model.inc_grad("W", dWopfi)
 | 
				
			||||||
        return dXf.reshape((dXf.shape[0], nF, nI))
 | 
					        return dXf.reshape((dXf.shape[0], nF, nI))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return Yf, backward
 | 
					    return Yf, backward
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -48,9 +48,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
 | 
				
			||||||
    def mlm_forward(model, docs, is_train):
 | 
					    def mlm_forward(model, docs, is_train):
 | 
				
			||||||
        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
 | 
					        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
 | 
				
			||||||
        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
 | 
					        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
 | 
				
			||||||
        output, backprop = model.get_ref("wrapped-model").begin_update(
 | 
					        output, backprop = model.get_ref("wrapped-model").begin_update(docs)
 | 
				
			||||||
            docs
 | 
					 | 
				
			||||||
        )  # drop=drop
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def mlm_backward(d_output):
 | 
					        def mlm_backward(d_output):
 | 
				
			||||||
            d_output *= 1 - mask
 | 
					            d_output *= 1 - mask
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,6 @@
 | 
				
			||||||
from pydantic import StrictInt
 | 
					from pydantic import StrictInt
 | 
				
			||||||
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
 | 
					from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
 | 
				
			||||||
 | 
					from thinc.api import LayerNorm, Maxout, Mish
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ...util import registry
 | 
					from ...util import registry
 | 
				
			||||||
from .._precomputable_affine import PrecomputableAffine
 | 
					from .._precomputable_affine import PrecomputableAffine
 | 
				
			||||||
| 
						 | 
					@ -16,7 +17,11 @@ def build_tb_parser_model(
 | 
				
			||||||
    nO=None,
 | 
					    nO=None,
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
 | 
					    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
 | 
				
			||||||
    tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),)
 | 
					    tok2vec = chain(
 | 
				
			||||||
 | 
					        tok2vec,
 | 
				
			||||||
 | 
					        list2array(),
 | 
				
			||||||
 | 
					        Linear(hidden_width, t2v_width),
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
    tok2vec.set_dim("nO", hidden_width)
 | 
					    tok2vec.set_dim("nO", hidden_width)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    lower = PrecomputableAffine(
 | 
					    lower = PrecomputableAffine(
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,8 +1,30 @@
 | 
				
			||||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 | 
					from thinc.api import (
 | 
				
			||||||
from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout
 | 
					    Model,
 | 
				
			||||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout
 | 
					    reduce_mean,
 | 
				
			||||||
from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed
 | 
					    Linear,
 | 
				
			||||||
from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor
 | 
					    list2ragged,
 | 
				
			||||||
 | 
					    Logistic,
 | 
				
			||||||
 | 
					    ParametricAttention,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from thinc.api import chain, concatenate, clone, Dropout
 | 
				
			||||||
 | 
					from thinc.api import (
 | 
				
			||||||
 | 
					    SparseLinear,
 | 
				
			||||||
 | 
					    Softmax,
 | 
				
			||||||
 | 
					    softmax_activation,
 | 
				
			||||||
 | 
					    Maxout,
 | 
				
			||||||
 | 
					    reduce_sum,
 | 
				
			||||||
 | 
					    Relu,
 | 
				
			||||||
 | 
					    residual,
 | 
				
			||||||
 | 
					    expand_window,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from thinc.api import (
 | 
				
			||||||
 | 
					    HashEmbed,
 | 
				
			||||||
 | 
					    with_ragged,
 | 
				
			||||||
 | 
					    with_array,
 | 
				
			||||||
 | 
					    with_cpu,
 | 
				
			||||||
 | 
					    uniqued,
 | 
				
			||||||
 | 
					    FeatureExtractor,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..spacy_vectors import SpacyVectors
 | 
					from ..spacy_vectors import SpacyVectors
 | 
				
			||||||
from ... import util
 | 
					from ... import util
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -147,7 +147,7 @@ def hash_char_embed_bilstm_v1(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
 | 
					@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
 | 
				
			||||||
def LayerNormalizedMaxout(width, maxout_pieces):
 | 
					def LayerNormalizedMaxout(width, maxout_pieces):
 | 
				
			||||||
    return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,)
 | 
					    return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.architectures.register("spacy.MultiHashEmbed.v1")
 | 
					@registry.architectures.register("spacy.MultiHashEmbed.v1")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -38,8 +38,9 @@ def forward(model, X, is_train):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def init(model, X=None, Y=None):
 | 
					def init(model, X=None, Y=None):
 | 
				
			||||||
    tok2vec = model.get_ref("tok2vec").initialize(X=X)
 | 
					    model.get_ref("tok2vec").initialize(X=X)
 | 
				
			||||||
    lower = model.get_ref("lower").initialize()
 | 
					    lower = model.get_ref("lower")
 | 
				
			||||||
 | 
					    lower.initialize()
 | 
				
			||||||
    if model.attrs["has_upper"]:
 | 
					    if model.attrs["has_upper"]:
 | 
				
			||||||
        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
 | 
					        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
 | 
				
			||||||
        model.get_ref("upper").initialize(X=statevecs)
 | 
					        model.get_ref("upper").initialize(X=statevecs)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -51,9 +51,9 @@ class Morphologizer(Tagger):
 | 
				
			||||||
    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
 | 
					    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
 | 
				
			||||||
                       **kwargs):
 | 
					                       **kwargs):
 | 
				
			||||||
        for example in get_examples():
 | 
					        for example in get_examples():
 | 
				
			||||||
            for i, morph in enumerate(example.token_annotation.morphs):
 | 
					            for i, token in enumerate(example.reference):
 | 
				
			||||||
                pos = example.token_annotation.get_pos(i)
 | 
					                pos = token.pos_
 | 
				
			||||||
                morph = Morphology.feats_to_dict(morph)
 | 
					                morph = token.morph
 | 
				
			||||||
                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
 | 
					                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
 | 
				
			||||||
                if pos:
 | 
					                if pos:
 | 
				
			||||||
                    morph["POS"] = pos
 | 
					                    morph["POS"] = pos
 | 
				
			||||||
| 
						 | 
					@ -91,11 +91,12 @@ class Morphologizer(Tagger):
 | 
				
			||||||
        correct = numpy.zeros((scores.shape[0],), dtype="i")
 | 
					        correct = numpy.zeros((scores.shape[0],), dtype="i")
 | 
				
			||||||
        guesses = scores.argmax(axis=1)
 | 
					        guesses = scores.argmax(axis=1)
 | 
				
			||||||
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
 | 
					        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
 | 
				
			||||||
        for ex in examples:
 | 
					        for eg in examples:
 | 
				
			||||||
            gold = ex.gold
 | 
					            pos_tags = eg.get_aligned("POS", as_string=True)
 | 
				
			||||||
            for i in range(len(gold.morphs)):
 | 
					            morphs = eg.get_aligned("MORPH", as_string=True)
 | 
				
			||||||
                pos = gold.pos[i] if i < len(gold.pos) else ""
 | 
					            for i in range(len(morphs)):
 | 
				
			||||||
                morph = gold.morphs[i]
 | 
					                pos = pos_tags[i]
 | 
				
			||||||
 | 
					                morph = morphs[i]
 | 
				
			||||||
                feats = Morphology.feats_to_dict(morph)
 | 
					                feats = Morphology.feats_to_dict(morph)
 | 
				
			||||||
                if pos:
 | 
					                if pos:
 | 
				
			||||||
                    feats["POS"] = pos
 | 
					                    feats["POS"] = pos
 | 
				
			||||||
| 
						 | 
					@ -115,7 +116,7 @@ class Morphologizer(Tagger):
 | 
				
			||||||
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
 | 
					        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
 | 
				
			||||||
        d_scores *= self.model.ops.asarray(known_labels)
 | 
					        d_scores *= self.model.ops.asarray(known_labels)
 | 
				
			||||||
        loss = (d_scores**2).sum()
 | 
					        loss = (d_scores**2).sum()
 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					        docs = [eg.predicted for eg in examples]
 | 
				
			||||||
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
 | 
					        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
 | 
				
			||||||
        return float(loss), d_scores
 | 
					        return float(loss), d_scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2,7 +2,6 @@
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
from ast import literal_eval
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from thinc.api import CosineDistance, to_categorical, get_array_module
 | 
					from thinc.api import CosineDistance, to_categorical, get_array_module
 | 
				
			||||||
from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
 | 
					from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy
 | 
				
			||||||
| 
						 | 
					@ -20,7 +19,7 @@ from .defaults import default_nel, default_senter
 | 
				
			||||||
from .functions import merge_subtokens
 | 
					from .functions import merge_subtokens
 | 
				
			||||||
from ..language import Language, component
 | 
					from ..language import Language, component
 | 
				
			||||||
from ..syntax import nonproj
 | 
					from ..syntax import nonproj
 | 
				
			||||||
from ..gold import Example
 | 
					from ..gold.example import Example
 | 
				
			||||||
from ..attrs import POS, ID
 | 
					from ..attrs import POS, ID
 | 
				
			||||||
from ..util import link_vectors_to_models, create_default_optimizer
 | 
					from ..util import link_vectors_to_models, create_default_optimizer
 | 
				
			||||||
from ..parts_of_speech import X
 | 
					from ..parts_of_speech import X
 | 
				
			||||||
| 
						 | 
					@ -48,56 +47,39 @@ class Pipe(object):
 | 
				
			||||||
    def from_nlp(cls, nlp, model, **cfg):
 | 
					    def from_nlp(cls, nlp, model, **cfg):
 | 
				
			||||||
        return cls(nlp.vocab, model, **cfg)
 | 
					        return cls(nlp.vocab, model, **cfg)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _get_doc(self, example):
 | 
					 | 
				
			||||||
        """ Use this method if the `example` can be both a Doc or an Example """
 | 
					 | 
				
			||||||
        if isinstance(example, Doc):
 | 
					 | 
				
			||||||
            return example
 | 
					 | 
				
			||||||
        return example.doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, vocab, model, **cfg):
 | 
					    def __init__(self, vocab, model, **cfg):
 | 
				
			||||||
        """Create a new pipe instance."""
 | 
					        """Create a new pipe instance."""
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, example):
 | 
					    def __call__(self, Doc doc):
 | 
				
			||||||
        """Apply the pipe to one document. The document is
 | 
					        """Apply the pipe to one document. The document is
 | 
				
			||||||
        modified in-place, and returned.
 | 
					        modified in-place, and returned.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Both __call__ and pipe should delegate to the `predict()`
 | 
					        Both __call__ and pipe should delegate to the `predict()`
 | 
				
			||||||
        and `set_annotations()` methods.
 | 
					        and `set_annotations()` methods.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        doc = self._get_doc(example)
 | 
					 | 
				
			||||||
        predictions = self.predict([doc])
 | 
					        predictions = self.predict([doc])
 | 
				
			||||||
        if isinstance(predictions, tuple) and len(predictions) == 2:
 | 
					        if isinstance(predictions, tuple) and len(predictions) == 2:
 | 
				
			||||||
            scores, tensors = predictions
 | 
					            scores, tensors = predictions
 | 
				
			||||||
            self.set_annotations([doc], scores, tensors=tensors)
 | 
					            self.set_annotations([doc], scores, tensors=tensors)
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            self.set_annotations([doc], predictions)
 | 
					            self.set_annotations([doc], predictions)
 | 
				
			||||||
        if isinstance(example, Example):
 | 
					 | 
				
			||||||
            example.doc = doc
 | 
					 | 
				
			||||||
            return example
 | 
					 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
 | 
					    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
				
			||||||
        """Apply the pipe to a stream of documents.
 | 
					        """Apply the pipe to a stream of documents.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Both __call__ and pipe should delegate to the `predict()`
 | 
					        Both __call__ and pipe should delegate to the `predict()`
 | 
				
			||||||
        and `set_annotations()` methods.
 | 
					        and `set_annotations()` methods.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        for examples in util.minibatch(stream, size=batch_size):
 | 
					        for docs in util.minibatch(stream, size=batch_size):
 | 
				
			||||||
            docs = [self._get_doc(ex) for ex in examples]
 | 
					 | 
				
			||||||
            predictions = self.predict(docs)
 | 
					            predictions = self.predict(docs)
 | 
				
			||||||
            if isinstance(predictions, tuple) and len(tuple) == 2:
 | 
					            if isinstance(predictions, tuple) and len(tuple) == 2:
 | 
				
			||||||
                scores, tensors = predictions
 | 
					                scores, tensors = predictions
 | 
				
			||||||
                self.set_annotations(docs, scores, tensors=tensors)
 | 
					                self.set_annotations(docs, scores, tensors=tensors)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                self.set_annotations(docs, predictions)
 | 
					                self.set_annotations(docs, predictions)
 | 
				
			||||||
 | 
					            yield from docs
 | 
				
			||||||
            if as_example:
 | 
					 | 
				
			||||||
                for ex, doc in zip(examples, docs):
 | 
					 | 
				
			||||||
                    ex.doc = doc
 | 
					 | 
				
			||||||
                    yield ex
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                yield from docs
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs):
 | 
					    def predict(self, docs):
 | 
				
			||||||
        """Apply the pipeline's model to a batch of docs, without
 | 
					        """Apply the pipeline's model to a batch of docs, without
 | 
				
			||||||
| 
						 | 
					@ -109,16 +91,6 @@ class Pipe(object):
 | 
				
			||||||
        """Modify a batch of documents, using pre-computed scores."""
 | 
					        """Modify a batch of documents, using pre-computed scores."""
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
 | 
					 | 
				
			||||||
        """Learn from a batch of documents and gold-standard information,
 | 
					 | 
				
			||||||
        updating the pipe's model.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Delegates to predict() and get_loss().
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        if set_annotations:
 | 
					 | 
				
			||||||
            docs = (self._get_doc(ex) for ex in examples)
 | 
					 | 
				
			||||||
            docs = list(self.pipe(docs))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def rehearse(self, examples, sgd=None, losses=None, **config):
 | 
					    def rehearse(self, examples, sgd=None, losses=None, **config):
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -255,29 +227,16 @@ class Tagger(Pipe):
 | 
				
			||||||
    def labels(self):
 | 
					    def labels(self):
 | 
				
			||||||
        return tuple(self.vocab.morphology.tag_names)
 | 
					        return tuple(self.vocab.morphology.tag_names)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, example):
 | 
					    def __call__(self, doc):
 | 
				
			||||||
        doc = self._get_doc(example)
 | 
					 | 
				
			||||||
        tags = self.predict([doc])
 | 
					        tags = self.predict([doc])
 | 
				
			||||||
        self.set_annotations([doc], tags)
 | 
					        self.set_annotations([doc], tags)
 | 
				
			||||||
        if isinstance(example, Example):
 | 
					 | 
				
			||||||
            example.doc = doc
 | 
					 | 
				
			||||||
            return example
 | 
					 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
 | 
					    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
				
			||||||
        for examples in util.minibatch(stream, size=batch_size):
 | 
					        for docs in util.minibatch(stream, size=batch_size):
 | 
				
			||||||
            docs = [self._get_doc(ex) for ex in examples]
 | 
					 | 
				
			||||||
            tag_ids = self.predict(docs)
 | 
					            tag_ids = self.predict(docs)
 | 
				
			||||||
            assert len(docs) == len(examples)
 | 
					 | 
				
			||||||
            assert len(tag_ids) == len(examples)
 | 
					 | 
				
			||||||
            self.set_annotations(docs, tag_ids)
 | 
					            self.set_annotations(docs, tag_ids)
 | 
				
			||||||
 | 
					            yield from docs
 | 
				
			||||||
            if as_example:
 | 
					 | 
				
			||||||
                for ex, doc in zip(examples, docs):
 | 
					 | 
				
			||||||
                    ex.doc = doc
 | 
					 | 
				
			||||||
                    yield ex
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                yield from docs
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs):
 | 
					    def predict(self, docs):
 | 
				
			||||||
        if not any(len(doc) for doc in docs):
 | 
					        if not any(len(doc) for doc in docs):
 | 
				
			||||||
| 
						 | 
					@ -327,15 +286,19 @@ class Tagger(Pipe):
 | 
				
			||||||
            doc.is_tagged = True
 | 
					            doc.is_tagged = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
 | 
					    def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					 | 
				
			||||||
        if losses is not None and self.name not in losses:
 | 
					        if losses is not None and self.name not in losses:
 | 
				
			||||||
            losses[self.name] = 0.
 | 
					            losses[self.name] = 0.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
 | 
					        try:
 | 
				
			||||||
            # Handle cases where there are no tokens in any docs.
 | 
					            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
 | 
				
			||||||
            return
 | 
					                # Handle cases where there are no tokens in any docs.
 | 
				
			||||||
 | 
					                return
 | 
				
			||||||
 | 
					        except AttributeError:
 | 
				
			||||||
 | 
					            types = set([type(eg) for eg in examples])
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E978.format(name="Tagger", method="update", types=types))
 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					        set_dropout_rate(self.model, drop)
 | 
				
			||||||
        tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
 | 
					        tag_scores, bp_tag_scores = self.model.begin_update(
 | 
				
			||||||
 | 
					            [eg.predicted for eg in examples])
 | 
				
			||||||
        for sc in tag_scores:
 | 
					        for sc in tag_scores:
 | 
				
			||||||
            if self.model.ops.xp.isnan(sc.sum()):
 | 
					            if self.model.ops.xp.isnan(sc.sum()):
 | 
				
			||||||
                raise ValueError("nan value in scores")
 | 
					                raise ValueError("nan value in scores")
 | 
				
			||||||
| 
						 | 
					@ -347,17 +310,20 @@ class Tagger(Pipe):
 | 
				
			||||||
        if losses is not None:
 | 
					        if losses is not None:
 | 
				
			||||||
            losses[self.name] += loss
 | 
					            losses[self.name] += loss
 | 
				
			||||||
        if set_annotations:
 | 
					        if set_annotations:
 | 
				
			||||||
            docs = [ex.doc for ex in examples]
 | 
					            docs = [eg.predicted for eg in examples]
 | 
				
			||||||
            self.set_annotations(docs, self._scores2guesses(tag_scores))
 | 
					            self.set_annotations(docs, self._scores2guesses(tag_scores))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def rehearse(self, examples, drop=0., sgd=None, losses=None):
 | 
					    def rehearse(self, examples, drop=0., sgd=None, losses=None):
 | 
				
			||||||
        """Perform a 'rehearsal' update, where we try to match the output of
 | 
					        """Perform a 'rehearsal' update, where we try to match the output of
 | 
				
			||||||
        an initial model.
 | 
					        an initial model.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            docs = [eg.predicted for eg in examples]
 | 
				
			||||||
 | 
					        except AttributeError:
 | 
				
			||||||
 | 
					            types = set([type(eg) for eg in examples])
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E978.format(name="Tagger", method="rehearse", types=types))
 | 
				
			||||||
        if self._rehearsal_model is None:
 | 
					        if self._rehearsal_model is None:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					 | 
				
			||||||
        if not any(len(doc) for doc in docs):
 | 
					        if not any(len(doc) for doc in docs):
 | 
				
			||||||
            # Handle cases where there are no tokens in any docs.
 | 
					            # Handle cases where there are no tokens in any docs.
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
| 
						 | 
					@ -373,7 +339,7 @@ class Tagger(Pipe):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_loss(self, examples, scores):
 | 
					    def get_loss(self, examples, scores):
 | 
				
			||||||
        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
 | 
					        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
 | 
				
			||||||
        truths = [eg.gold.tags for eg in examples]
 | 
					        truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
 | 
				
			||||||
        d_scores, loss = loss_func(scores, truths)
 | 
					        d_scores, loss = loss_func(scores, truths)
 | 
				
			||||||
        if self.model.ops.xp.isnan(loss):
 | 
					        if self.model.ops.xp.isnan(loss):
 | 
				
			||||||
            raise ValueError("nan value when computing loss")
 | 
					            raise ValueError("nan value when computing loss")
 | 
				
			||||||
| 
						 | 
					@ -389,7 +355,12 @@ class Tagger(Pipe):
 | 
				
			||||||
        orig_tag_map = dict(self.vocab.morphology.tag_map)
 | 
					        orig_tag_map = dict(self.vocab.morphology.tag_map)
 | 
				
			||||||
        new_tag_map = {}
 | 
					        new_tag_map = {}
 | 
				
			||||||
        for example in get_examples():
 | 
					        for example in get_examples():
 | 
				
			||||||
            for tag in example.token_annotation.tags:
 | 
					            try:
 | 
				
			||||||
 | 
					                y = example.y
 | 
				
			||||||
 | 
					            except AttributeError:
 | 
				
			||||||
 | 
					                raise ValueError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example)))
 | 
				
			||||||
 | 
					            for token in y:
 | 
				
			||||||
 | 
					                tag = token.tag_
 | 
				
			||||||
                if tag in orig_tag_map:
 | 
					                if tag in orig_tag_map:
 | 
				
			||||||
                    new_tag_map[tag] = orig_tag_map[tag]
 | 
					                    new_tag_map[tag] = orig_tag_map[tag]
 | 
				
			||||||
                else:
 | 
					                else:
 | 
				
			||||||
| 
						 | 
					@ -564,9 +535,9 @@ class SentenceRecognizer(Tagger):
 | 
				
			||||||
        correct = numpy.zeros((scores.shape[0],), dtype="i")
 | 
					        correct = numpy.zeros((scores.shape[0],), dtype="i")
 | 
				
			||||||
        guesses = scores.argmax(axis=1)
 | 
					        guesses = scores.argmax(axis=1)
 | 
				
			||||||
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
 | 
					        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
 | 
				
			||||||
        for ex in examples:
 | 
					        for eg in examples:
 | 
				
			||||||
            gold = ex.gold
 | 
					            sent_starts = eg.get_aligned("sent_start")
 | 
				
			||||||
            for sent_start in gold.sent_starts:
 | 
					            for sent_start in sent_starts:
 | 
				
			||||||
                if sent_start is None:
 | 
					                if sent_start is None:
 | 
				
			||||||
                    correct[idx] = guesses[idx]
 | 
					                    correct[idx] = guesses[idx]
 | 
				
			||||||
                elif sent_start in tag_index:
 | 
					                elif sent_start in tag_index:
 | 
				
			||||||
| 
						 | 
					@ -579,7 +550,7 @@ class SentenceRecognizer(Tagger):
 | 
				
			||||||
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
 | 
					        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
 | 
				
			||||||
        d_scores *= self.model.ops.asarray(known_labels)
 | 
					        d_scores *= self.model.ops.asarray(known_labels)
 | 
				
			||||||
        loss = (d_scores**2).sum()
 | 
					        loss = (d_scores**2).sum()
 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					        docs = [eg.predicted for eg in examples]
 | 
				
			||||||
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
 | 
					        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
 | 
				
			||||||
        return float(loss), d_scores
 | 
					        return float(loss), d_scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -690,8 +661,8 @@ class MultitaskObjective(Tagger):
 | 
				
			||||||
        gold_examples = nonproj.preprocess_training_data(get_examples())
 | 
					        gold_examples = nonproj.preprocess_training_data(get_examples())
 | 
				
			||||||
        # for raw_text, doc_annot in gold_tuples:
 | 
					        # for raw_text, doc_annot in gold_tuples:
 | 
				
			||||||
        for example in gold_examples:
 | 
					        for example in gold_examples:
 | 
				
			||||||
            for i in range(len(example.token_annotation.ids)):
 | 
					            for token in example.y:
 | 
				
			||||||
                label = self.make_label(i, example.token_annotation)
 | 
					                label = self.make_label(token)
 | 
				
			||||||
                if label is not None and label not in self.labels:
 | 
					                if label is not None and label not in self.labels:
 | 
				
			||||||
                    self.labels[label] = len(self.labels)
 | 
					                    self.labels[label] = len(self.labels)
 | 
				
			||||||
        self.model.initialize()
 | 
					        self.model.initialize()
 | 
				
			||||||
| 
						 | 
					@ -709,13 +680,13 @@ class MultitaskObjective(Tagger):
 | 
				
			||||||
        cdef int idx = 0
 | 
					        cdef int idx = 0
 | 
				
			||||||
        correct = numpy.zeros((scores.shape[0],), dtype="i")
 | 
					        correct = numpy.zeros((scores.shape[0],), dtype="i")
 | 
				
			||||||
        guesses = scores.argmax(axis=1)
 | 
					        guesses = scores.argmax(axis=1)
 | 
				
			||||||
        golds = [ex.gold for ex in examples]
 | 
					        docs = [eg.predicted for eg in examples]
 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					        for i, eg in enumerate(examples):
 | 
				
			||||||
        for i, gold in enumerate(golds):
 | 
					            # Handles alignment for tokenization differences
 | 
				
			||||||
            for j in range(len(docs[i])):
 | 
					            doc_annots = eg.get_aligned()  # TODO
 | 
				
			||||||
                # Handels alignment for tokenization differences
 | 
					            for j in range(len(eg.predicted)):
 | 
				
			||||||
                token_annotation = gold.get_token_annotation()
 | 
					                tok_annots = {key: values[j] for key, values in tok_annots.items()}
 | 
				
			||||||
                label = self.make_label(j, token_annotation)
 | 
					                label = self.make_label(j, tok_annots)
 | 
				
			||||||
                if label is None or label not in self.labels:
 | 
					                if label is None or label not in self.labels:
 | 
				
			||||||
                    correct[idx] = guesses[idx]
 | 
					                    correct[idx] = guesses[idx]
 | 
				
			||||||
                else:
 | 
					                else:
 | 
				
			||||||
| 
						 | 
					@ -727,83 +698,49 @@ class MultitaskObjective(Tagger):
 | 
				
			||||||
        return float(loss), d_scores
 | 
					        return float(loss), d_scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def make_dep(i, token_annotation):
 | 
					    def make_dep(token):
 | 
				
			||||||
        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
 | 
					        return token.dep_
 | 
				
			||||||
            return None
 | 
					 | 
				
			||||||
        return token_annotation.deps[i]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def make_tag(i, token_annotation):
 | 
					    def make_tag(token):
 | 
				
			||||||
        return token_annotation.tags[i]
 | 
					        return token.tag_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def make_ent(i, token_annotation):
 | 
					    def make_ent(token):
 | 
				
			||||||
        if token_annotation.entities is None:
 | 
					        if token.ent_iob_ == "O":
 | 
				
			||||||
            return None
 | 
					            return "O"
 | 
				
			||||||
        return token_annotation.entities[i]
 | 
					        else:
 | 
				
			||||||
 | 
					            return token.ent_iob_ + "-" + token.ent_type_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def make_dep_tag_offset(i, token_annotation):
 | 
					    def make_dep_tag_offset(token):
 | 
				
			||||||
        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
 | 
					        dep = token.dep_
 | 
				
			||||||
            return None
 | 
					        tag = token.tag_
 | 
				
			||||||
        offset = token_annotation.heads[i] - i
 | 
					        offset = token.head.i - token.i
 | 
				
			||||||
        offset = min(offset, 2)
 | 
					        offset = min(offset, 2)
 | 
				
			||||||
        offset = max(offset, -2)
 | 
					        offset = max(offset, -2)
 | 
				
			||||||
        return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
 | 
					        return f"{dep}-{tag}:{offset}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def make_ent_tag(i, token_annotation):
 | 
					    def make_ent_tag(token):
 | 
				
			||||||
        if token_annotation.entities is None or token_annotation.entities[i] is None:
 | 
					        if token.ent_iob_ == "O":
 | 
				
			||||||
            return None
 | 
					            ent = "O"
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
 | 
					            ent = token.ent_iob_ + "-" + token.ent_type_
 | 
				
			||||||
 | 
					        tag = token.tag_
 | 
				
			||||||
 | 
					        return f"{tag}-{ent}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def make_sent_start(target, token_annotation, cache=True, _cache={}):
 | 
					    def make_sent_start(token):
 | 
				
			||||||
        """A multi-task objective for representing sentence boundaries,
 | 
					        """A multi-task objective for representing sentence boundaries,
 | 
				
			||||||
        using BILU scheme. (O is impossible)
 | 
					        using BILU scheme. (O is impossible)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        The implementation of this method uses an internal cache that relies
 | 
					 | 
				
			||||||
        on the identity of the heads array, to avoid requiring a new piece
 | 
					 | 
				
			||||||
        of gold data. You can pass cache=False if you know the cache will
 | 
					 | 
				
			||||||
        do the wrong thing.
 | 
					 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        words = token_annotation.words
 | 
					        if token.is_sent_start and token.is_sent_end:
 | 
				
			||||||
        heads = token_annotation.heads
 | 
					            return "U-SENT"
 | 
				
			||||||
        assert len(words) == len(heads)
 | 
					        elif token.is_sent_start:
 | 
				
			||||||
        assert target < len(words), (target, len(words))
 | 
					            return "B-SENT"
 | 
				
			||||||
        if cache:
 | 
					 | 
				
			||||||
            if id(heads) in _cache:
 | 
					 | 
				
			||||||
                return _cache[id(heads)][target]
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                for key in list(_cache.keys()):
 | 
					 | 
				
			||||||
                    _cache.pop(key)
 | 
					 | 
				
			||||||
            sent_tags = ["I-SENT"] * len(words)
 | 
					 | 
				
			||||||
            _cache[id(heads)] = sent_tags
 | 
					 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            sent_tags = ["I-SENT"] * len(words)
 | 
					            return "I-SENT"
 | 
				
			||||||
 | 
					 | 
				
			||||||
        def _find_root(child):
 | 
					 | 
				
			||||||
            seen = set([child])
 | 
					 | 
				
			||||||
            while child is not None and heads[child] != child:
 | 
					 | 
				
			||||||
                seen.add(child)
 | 
					 | 
				
			||||||
                child = heads[child]
 | 
					 | 
				
			||||||
            return child
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        sentences = {}
 | 
					 | 
				
			||||||
        for i in range(len(words)):
 | 
					 | 
				
			||||||
            root = _find_root(i)
 | 
					 | 
				
			||||||
            if root is None:
 | 
					 | 
				
			||||||
                sent_tags[i] = None
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                sentences.setdefault(root, []).append(i)
 | 
					 | 
				
			||||||
        for root, span in sorted(sentences.items()):
 | 
					 | 
				
			||||||
            if len(span) == 1:
 | 
					 | 
				
			||||||
                sent_tags[span[0]] = "U-SENT"
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                sent_tags[span[0]] = "B-SENT"
 | 
					 | 
				
			||||||
                sent_tags[span[-1]] = "L-SENT"
 | 
					 | 
				
			||||||
        return sent_tags[target]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ClozeMultitask(Pipe):
 | 
					class ClozeMultitask(Pipe):
 | 
				
			||||||
| 
						 | 
					@ -836,7 +773,7 @@ class ClozeMultitask(Pipe):
 | 
				
			||||||
        # token.vector values, but that's a bit inefficient, especially on GPU.
 | 
					        # token.vector values, but that's a bit inefficient, especially on GPU.
 | 
				
			||||||
        # Instead we fetch the index into the vectors table for each of our tokens,
 | 
					        # Instead we fetch the index into the vectors table for each of our tokens,
 | 
				
			||||||
        # and look them up all at once. This prevents data copying.
 | 
					        # and look them up all at once. This prevents data copying.
 | 
				
			||||||
        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
 | 
					        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
 | 
				
			||||||
        target = vectors[ids]
 | 
					        target = vectors[ids]
 | 
				
			||||||
        gradient = self.distance.get_grad(prediction, target)
 | 
					        gradient = self.distance.get_grad(prediction, target)
 | 
				
			||||||
        loss = self.distance.get_loss(prediction, target)
 | 
					        loss = self.distance.get_loss(prediction, target)
 | 
				
			||||||
| 
						 | 
					@ -846,11 +783,14 @@ class ClozeMultitask(Pipe):
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def rehearse(self, examples, drop=0., sgd=None, losses=None):
 | 
					    def rehearse(self, examples, drop=0., sgd=None, losses=None):
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					 | 
				
			||||||
        if losses is not None and self.name not in losses:
 | 
					        if losses is not None and self.name not in losses:
 | 
				
			||||||
            losses[self.name] = 0.
 | 
					            losses[self.name] = 0.
 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					        set_dropout_rate(self.model, drop)
 | 
				
			||||||
        predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
 | 
					        try:
 | 
				
			||||||
 | 
					            predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples])
 | 
				
			||||||
 | 
					        except AttributeError:
 | 
				
			||||||
 | 
					            types = set([type(eg) for eg in examples])
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types))
 | 
				
			||||||
        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
 | 
					        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
 | 
				
			||||||
        bp_predictions(d_predictions)
 | 
					        bp_predictions(d_predictions)
 | 
				
			||||||
        if sgd is not None:
 | 
					        if sgd is not None:
 | 
				
			||||||
| 
						 | 
					@ -885,18 +825,11 @@ class TextCategorizer(Pipe):
 | 
				
			||||||
    def labels(self, value):
 | 
					    def labels(self, value):
 | 
				
			||||||
        self.cfg["labels"] = tuple(value)
 | 
					        self.cfg["labels"] = tuple(value)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
 | 
					    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
				
			||||||
        for examples in util.minibatch(stream, size=batch_size):
 | 
					        for docs in util.minibatch(stream, size=batch_size):
 | 
				
			||||||
            docs = [self._get_doc(ex) for ex in examples]
 | 
					 | 
				
			||||||
            scores, tensors = self.predict(docs)
 | 
					            scores, tensors = self.predict(docs)
 | 
				
			||||||
            self.set_annotations(docs, scores, tensors=tensors)
 | 
					            self.set_annotations(docs, scores, tensors=tensors)
 | 
				
			||||||
 | 
					            yield from docs
 | 
				
			||||||
            if as_example:
 | 
					 | 
				
			||||||
                for ex, doc in zip(examples, docs):
 | 
					 | 
				
			||||||
                    ex.doc = doc
 | 
					 | 
				
			||||||
                    yield ex
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                yield from docs
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs):
 | 
					    def predict(self, docs):
 | 
				
			||||||
        tensors = [doc.tensor for doc in docs]
 | 
					        tensors = [doc.tensor for doc in docs]
 | 
				
			||||||
| 
						 | 
					@ -917,12 +850,17 @@ class TextCategorizer(Pipe):
 | 
				
			||||||
                doc.cats[label] = float(scores[i, j])
 | 
					                doc.cats[label] = float(scores[i, j])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
 | 
					    def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					        try:
 | 
				
			||||||
        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
 | 
					            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
 | 
				
			||||||
            # Handle cases where there are no tokens in any docs.
 | 
					                # Handle cases where there are no tokens in any docs.
 | 
				
			||||||
            return
 | 
					                return
 | 
				
			||||||
 | 
					        except AttributeError:
 | 
				
			||||||
 | 
					            types = set([type(eg) for eg in examples])
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=types))
 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					        set_dropout_rate(self.model, drop)
 | 
				
			||||||
        scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
 | 
					        scores, bp_scores = self.model.begin_update(
 | 
				
			||||||
 | 
					            [eg.predicted for eg in examples]
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
        loss, d_scores = self.get_loss(examples, scores)
 | 
					        loss, d_scores = self.get_loss(examples, scores)
 | 
				
			||||||
        bp_scores(d_scores)
 | 
					        bp_scores(d_scores)
 | 
				
			||||||
        if sgd is not None:
 | 
					        if sgd is not None:
 | 
				
			||||||
| 
						 | 
					@ -931,14 +869,17 @@ class TextCategorizer(Pipe):
 | 
				
			||||||
            losses.setdefault(self.name, 0.0)
 | 
					            losses.setdefault(self.name, 0.0)
 | 
				
			||||||
            losses[self.name] += loss
 | 
					            losses[self.name] += loss
 | 
				
			||||||
        if set_annotations:
 | 
					        if set_annotations:
 | 
				
			||||||
            docs = [ex.doc for ex in examples]
 | 
					            docs = [eg.predicted for eg in examples]
 | 
				
			||||||
            self.set_annotations(docs, scores=scores)
 | 
					            self.set_annotations(docs, scores=scores)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def rehearse(self, examples, drop=0., sgd=None, losses=None):
 | 
					    def rehearse(self, examples, drop=0., sgd=None, losses=None):
 | 
				
			||||||
        if self._rehearsal_model is None:
 | 
					        if self._rehearsal_model is None:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					        try:
 | 
				
			||||||
        docs=[ex.doc for ex in examples]
 | 
					            docs = [eg.predicted for eg in examples]
 | 
				
			||||||
 | 
					        except AttributeError:
 | 
				
			||||||
 | 
					            types = set([type(eg) for eg in examples])
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E978.format(name="TextCategorizer", method="rehearse", types=types))
 | 
				
			||||||
        if not any(len(doc) for doc in docs):
 | 
					        if not any(len(doc) for doc in docs):
 | 
				
			||||||
            # Handle cases where there are no tokens in any docs.
 | 
					            # Handle cases where there are no tokens in any docs.
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
| 
						 | 
					@ -954,13 +895,12 @@ class TextCategorizer(Pipe):
 | 
				
			||||||
            losses[self.name] += (gradient**2).sum()
 | 
					            losses[self.name] += (gradient**2).sum()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _examples_to_truth(self, examples):
 | 
					    def _examples_to_truth(self, examples):
 | 
				
			||||||
        gold_cats = [ex.doc_annotation.cats for ex in examples]
 | 
					        truths = numpy.zeros((len(examples), len(self.labels)), dtype="f")
 | 
				
			||||||
        truths = numpy.zeros((len(gold_cats), len(self.labels)), dtype="f")
 | 
					        not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
 | 
				
			||||||
        not_missing = numpy.ones((len(gold_cats), len(self.labels)), dtype="f")
 | 
					        for i, eg in enumerate(examples):
 | 
				
			||||||
        for i, gold_cat in enumerate(gold_cats):
 | 
					 | 
				
			||||||
            for j, label in enumerate(self.labels):
 | 
					            for j, label in enumerate(self.labels):
 | 
				
			||||||
                if label in gold_cat:
 | 
					                if label in eg.reference.cats:
 | 
				
			||||||
                    truths[i, j] = gold_cat[label]
 | 
					                    truths[i, j] = eg.reference.cats[label]
 | 
				
			||||||
                else:
 | 
					                else:
 | 
				
			||||||
                    not_missing[i, j] = 0.
 | 
					                    not_missing[i, j] = 0.
 | 
				
			||||||
        truths = self.model.ops.asarray(truths)
 | 
					        truths = self.model.ops.asarray(truths)
 | 
				
			||||||
| 
						 | 
					@ -997,7 +937,11 @@ class TextCategorizer(Pipe):
 | 
				
			||||||
        # TODO: begin_training is not guaranteed to see all data / labels ?
 | 
					        # TODO: begin_training is not guaranteed to see all data / labels ?
 | 
				
			||||||
        examples = list(get_examples())
 | 
					        examples = list(get_examples())
 | 
				
			||||||
        for example in examples:
 | 
					        for example in examples:
 | 
				
			||||||
            for cat in example.doc_annotation.cats:
 | 
					            try:
 | 
				
			||||||
 | 
					                y = example.y
 | 
				
			||||||
 | 
					            except AttributeError:
 | 
				
			||||||
 | 
					                raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=type(example)))
 | 
				
			||||||
 | 
					            for cat in y.cats:
 | 
				
			||||||
                self.add_label(cat)
 | 
					                self.add_label(cat)
 | 
				
			||||||
        self.require_labels()
 | 
					        self.require_labels()
 | 
				
			||||||
        docs = [Doc(Vocab(), words=["hello"])]
 | 
					        docs = [Doc(Vocab(), words=["hello"])]
 | 
				
			||||||
| 
						 | 
					@ -1156,65 +1100,52 @@ class EntityLinker(Pipe):
 | 
				
			||||||
            losses.setdefault(self.name, 0.0)
 | 
					            losses.setdefault(self.name, 0.0)
 | 
				
			||||||
        if not examples:
 | 
					        if not examples:
 | 
				
			||||||
            return 0
 | 
					            return 0
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					 | 
				
			||||||
        sentence_docs = []
 | 
					        sentence_docs = []
 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					        try:
 | 
				
			||||||
 | 
					            docs = [eg.predicted for eg in examples]
 | 
				
			||||||
 | 
					        except AttributeError:
 | 
				
			||||||
 | 
					            types = set([type(eg) for eg in examples])
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E978.format(name="EntityLinker", method="update", types=types))
 | 
				
			||||||
        if set_annotations:
 | 
					        if set_annotations:
 | 
				
			||||||
            # This seems simpler than other ways to get that exact output -- but
 | 
					            # This seems simpler than other ways to get that exact output -- but
 | 
				
			||||||
            # it does run the model twice :(
 | 
					            # it does run the model twice :(
 | 
				
			||||||
            predictions = self.model.predict(docs)
 | 
					            predictions = self.model.predict(docs)
 | 
				
			||||||
        golds = [ex.gold for ex in examples]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for doc, gold in zip(docs, golds):
 | 
					        for eg in examples:
 | 
				
			||||||
            ents_by_offset = dict()
 | 
					            sentences = [s for s in eg.predicted.sents]
 | 
				
			||||||
 | 
					            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
 | 
				
			||||||
 | 
					            for ent in eg.predicted.ents:
 | 
				
			||||||
 | 
					                kb_id = kb_ids[ent.start]  # KB ID of the first token is the same as the whole span
 | 
				
			||||||
 | 
					                if kb_id:
 | 
				
			||||||
 | 
					                    try:
 | 
				
			||||||
 | 
					                        # find the sentence in the list of sentences.
 | 
				
			||||||
 | 
					                        sent_index = sentences.index(ent.sent)
 | 
				
			||||||
 | 
					                    except AttributeError:
 | 
				
			||||||
 | 
					                        # Catch the exception when ent.sent is None and provide a user-friendly warning
 | 
				
			||||||
 | 
					                        raise RuntimeError(Errors.E030)
 | 
				
			||||||
 | 
					                    # get n previous sentences, if there are any
 | 
				
			||||||
 | 
					                    start_sentence = max(0, sent_index - self.n_sents)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            sentences = [s for s in doc.sents]
 | 
					                    # get n posterior sentences, or as many < n as there are
 | 
				
			||||||
 | 
					                    end_sentence = min(len(sentences) -1, sent_index + self.n_sents)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            for ent in doc.ents:
 | 
					                    # get token positions
 | 
				
			||||||
                ents_by_offset[(ent.start_char, ent.end_char)] = ent
 | 
					                    start_token = sentences[start_sentence].start
 | 
				
			||||||
 | 
					                    end_token = sentences[end_sentence].end
 | 
				
			||||||
            for entity, kb_dict in gold.links.items():
 | 
					 | 
				
			||||||
                if isinstance(entity, str):
 | 
					 | 
				
			||||||
                    entity = literal_eval(entity)
 | 
					 | 
				
			||||||
                start, end = entity
 | 
					 | 
				
			||||||
                mention = doc.text[start:end]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
 | 
					 | 
				
			||||||
                if not (start, end) in ents_by_offset:
 | 
					 | 
				
			||||||
                    raise RuntimeError(Errors.E188)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                ent = ents_by_offset[(start, end)]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                for kb_id, value in kb_dict.items():
 | 
					 | 
				
			||||||
                    # Currently only training on the positive instances - we assume there is at least 1 per doc/gold
 | 
					 | 
				
			||||||
                    if value:
 | 
					 | 
				
			||||||
                        try:
 | 
					 | 
				
			||||||
                            # find the sentence in the list of sentences.
 | 
					 | 
				
			||||||
                            sent_index = sentences.index(ent.sent)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                        except AttributeError:
 | 
					 | 
				
			||||||
                            # Catch the exception when ent.sent is None and provide a user-friendly warning
 | 
					 | 
				
			||||||
                            raise RuntimeError(Errors.E030)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                        # get n previous sentences, if there are any
 | 
					 | 
				
			||||||
                        start_sentence = max(0, sent_index - self.n_sents)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                        # get n posterior sentences, or as many < n as there are
 | 
					 | 
				
			||||||
                        end_sentence = min(len(sentences) -1, sent_index + self.n_sents)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                        # get token positions
 | 
					 | 
				
			||||||
                        start_token = sentences[start_sentence].start
 | 
					 | 
				
			||||||
                        end_token = sentences[end_sentence].end
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                        # append that span as a doc to training
 | 
					 | 
				
			||||||
                        sent_doc = doc[start_token:end_token].as_doc()
 | 
					 | 
				
			||||||
                        sentence_docs.append(sent_doc)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                    # append that span as a doc to training
 | 
				
			||||||
 | 
					                    sent_doc = eg.predicted[start_token:end_token].as_doc()
 | 
				
			||||||
 | 
					                    sentence_docs.append(sent_doc)
 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					        set_dropout_rate(self.model, drop)
 | 
				
			||||||
 | 
					        if not sentence_docs:
 | 
				
			||||||
 | 
					            warnings.warn(Warnings.W093.format(name="Entity Linker"))
 | 
				
			||||||
 | 
					            return 0.0
 | 
				
			||||||
        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
 | 
					        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
 | 
				
			||||||
        loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
 | 
					        loss, d_scores = self.get_similarity_loss(
 | 
				
			||||||
 | 
					            scores=sentence_encodings,
 | 
				
			||||||
 | 
					            examples=examples
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
        bp_context(d_scores)
 | 
					        bp_context(d_scores)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        if sgd is not None:
 | 
					        if sgd is not None:
 | 
				
			||||||
            self.model.finish_update(sgd)
 | 
					            self.model.finish_update(sgd)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1224,15 +1155,15 @@ class EntityLinker(Pipe):
 | 
				
			||||||
            self.set_annotations(docs, predictions)
 | 
					            self.set_annotations(docs, predictions)
 | 
				
			||||||
        return loss
 | 
					        return loss
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_similarity_loss(self, golds, scores):
 | 
					    def get_similarity_loss(self, examples, scores):
 | 
				
			||||||
        entity_encodings = []
 | 
					        entity_encodings = []
 | 
				
			||||||
        for gold in golds:
 | 
					        for eg in examples:
 | 
				
			||||||
            for entity, kb_dict in gold.links.items():
 | 
					            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
 | 
				
			||||||
                for kb_id, value in kb_dict.items():
 | 
					            for ent in eg.predicted.ents:
 | 
				
			||||||
                    # this loss function assumes we're only using positive examples
 | 
					                kb_id = kb_ids[ent.start]
 | 
				
			||||||
                    if value:
 | 
					                if kb_id:
 | 
				
			||||||
                        entity_encoding = self.kb.get_vector(kb_id)
 | 
					                    entity_encoding = self.kb.get_vector(kb_id)
 | 
				
			||||||
                        entity_encodings.append(entity_encoding)
 | 
					                    entity_encodings.append(entity_encoding)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
 | 
					        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1246,10 +1177,12 @@ class EntityLinker(Pipe):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_loss(self, examples, scores):
 | 
					    def get_loss(self, examples, scores):
 | 
				
			||||||
        cats = []
 | 
					        cats = []
 | 
				
			||||||
        for ex in examples:
 | 
					        for eg in examples:
 | 
				
			||||||
            for entity, kb_dict in ex.gold.links.items():
 | 
					            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
 | 
				
			||||||
                for kb_id, value in kb_dict.items():
 | 
					            for ent in eg.predicted.ents:
 | 
				
			||||||
                    cats.append([value])
 | 
					                kb_id = kb_ids[ent.start]
 | 
				
			||||||
 | 
					                if kb_id:
 | 
				
			||||||
 | 
					                    cats.append([1.0])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cats = self.model.ops.asarray(cats, dtype="float32")
 | 
					        cats = self.model.ops.asarray(cats, dtype="float32")
 | 
				
			||||||
        if len(scores) != len(cats):
 | 
					        if len(scores) != len(cats):
 | 
				
			||||||
| 
						 | 
					@ -1260,27 +1193,16 @@ class EntityLinker(Pipe):
 | 
				
			||||||
        loss = loss / len(cats)
 | 
					        loss = loss / len(cats)
 | 
				
			||||||
        return loss, d_scores
 | 
					        return loss, d_scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, example):
 | 
					    def __call__(self, doc):
 | 
				
			||||||
        doc = self._get_doc(example)
 | 
					 | 
				
			||||||
        kb_ids, tensors = self.predict([doc])
 | 
					        kb_ids, tensors = self.predict([doc])
 | 
				
			||||||
        self.set_annotations([doc], kb_ids, tensors=tensors)
 | 
					        self.set_annotations([doc], kb_ids, tensors=tensors)
 | 
				
			||||||
        if isinstance(example, Example):
 | 
					 | 
				
			||||||
            example.doc = doc
 | 
					 | 
				
			||||||
            return example
 | 
					 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
 | 
					    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
				
			||||||
        for examples in util.minibatch(stream, size=batch_size):
 | 
					        for docs in util.minibatch(stream, size=batch_size):
 | 
				
			||||||
            docs = [self._get_doc(ex) for ex in examples]
 | 
					 | 
				
			||||||
            kb_ids, tensors = self.predict(docs)
 | 
					            kb_ids, tensors = self.predict(docs)
 | 
				
			||||||
            self.set_annotations(docs, kb_ids, tensors=tensors)
 | 
					            self.set_annotations(docs, kb_ids, tensors=tensors)
 | 
				
			||||||
 | 
					            yield from docs
 | 
				
			||||||
            if as_example:
 | 
					 | 
				
			||||||
                for ex, doc in zip(examples, docs):
 | 
					 | 
				
			||||||
                    ex.doc = doc
 | 
					 | 
				
			||||||
                    yield ex
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                yield from docs
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs):
 | 
					    def predict(self, docs):
 | 
				
			||||||
        """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
 | 
					        """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
 | 
				
			||||||
| 
						 | 
					@ -1466,7 +1388,7 @@ class Sentencizer(Pipe):
 | 
				
			||||||
    ):
 | 
					    ):
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, example):
 | 
					    def __call__(self, doc):
 | 
				
			||||||
        """Apply the sentencizer to a Doc and set Token.is_sent_start.
 | 
					        """Apply the sentencizer to a Doc and set Token.is_sent_start.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        example (Doc or Example): The document to process.
 | 
					        example (Doc or Example): The document to process.
 | 
				
			||||||
| 
						 | 
					@ -1474,7 +1396,6 @@ class Sentencizer(Pipe):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/sentencizer#call
 | 
					        DOCS: https://spacy.io/api/sentencizer#call
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        doc = self._get_doc(example)
 | 
					 | 
				
			||||||
        start = 0
 | 
					        start = 0
 | 
				
			||||||
        seen_period = False
 | 
					        seen_period = False
 | 
				
			||||||
        for i, token in enumerate(doc):
 | 
					        for i, token in enumerate(doc):
 | 
				
			||||||
| 
						 | 
					@ -1488,26 +1409,17 @@ class Sentencizer(Pipe):
 | 
				
			||||||
                seen_period = True
 | 
					                seen_period = True
 | 
				
			||||||
        if start < len(doc):
 | 
					        if start < len(doc):
 | 
				
			||||||
            doc[start].is_sent_start = True
 | 
					            doc[start].is_sent_start = True
 | 
				
			||||||
        if isinstance(example, Example):
 | 
					 | 
				
			||||||
            example.doc = doc
 | 
					 | 
				
			||||||
            return example
 | 
					 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
 | 
					    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
				
			||||||
        for examples in util.minibatch(stream, size=batch_size):
 | 
					        for docs in util.minibatch(stream, size=batch_size):
 | 
				
			||||||
            docs = [self._get_doc(ex) for ex in examples]
 | 
					 | 
				
			||||||
            predictions = self.predict(docs)
 | 
					            predictions = self.predict(docs)
 | 
				
			||||||
            if isinstance(predictions, tuple) and len(tuple) == 2:
 | 
					            if isinstance(predictions, tuple) and len(tuple) == 2:
 | 
				
			||||||
                scores, tensors = predictions
 | 
					                scores, tensors = predictions
 | 
				
			||||||
                self.set_annotations(docs, scores, tensors=tensors)
 | 
					                self.set_annotations(docs, scores, tensors=tensors)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                self.set_annotations(docs, predictions)
 | 
					                self.set_annotations(docs, predictions)
 | 
				
			||||||
            if as_example:
 | 
					            yield from docs
 | 
				
			||||||
                for ex, doc in zip(examples, docs):
 | 
					 | 
				
			||||||
                    ex.doc = doc
 | 
					 | 
				
			||||||
                    yield ex
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                yield from docs
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs):
 | 
					    def predict(self, docs):
 | 
				
			||||||
        """Apply the pipeline's model to a batch of docs, without
 | 
					        """Apply the pipeline's model to a batch of docs, without
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -70,8 +70,7 @@ class SimpleNER(Pipe):
 | 
				
			||||||
    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
 | 
					    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
 | 
				
			||||||
        if not any(_has_ner(eg) for eg in examples):
 | 
					        if not any(_has_ner(eg) for eg in examples):
 | 
				
			||||||
            return 0
 | 
					            return 0
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					        docs = [eg.doc for eg in examples]
 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					        set_dropout_rate(self.model, drop)
 | 
				
			||||||
        scores, bp_scores = self.model.begin_update(docs)
 | 
					        scores, bp_scores = self.model.begin_update(docs)
 | 
				
			||||||
        loss, d_scores = self.get_loss(examples, scores)
 | 
					        loss, d_scores = self.get_loss(examples, scores)
 | 
				
			||||||
| 
						 | 
					@ -140,8 +139,7 @@ def _has_ner(eg):
 | 
				
			||||||
def _get_labels(examples):
 | 
					def _get_labels(examples):
 | 
				
			||||||
    labels = set()
 | 
					    labels = set()
 | 
				
			||||||
    for eg in examples:
 | 
					    for eg in examples:
 | 
				
			||||||
        for ner_tag in eg.token_annotation.entities:
 | 
					        for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
 | 
				
			||||||
            if ner_tag != "O" and ner_tag != "-":
 | 
					            if ner_tag != "O" and ner_tag != "-":
 | 
				
			||||||
                _, label = ner_tag.split("-", 1)
 | 
					                labels.add(ner_tag)
 | 
				
			||||||
                labels.add(label)
 | 
					 | 
				
			||||||
    return list(sorted(labels))
 | 
					    return list(sorted(labels))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,7 +5,7 @@ from ..gold import Example
 | 
				
			||||||
from ..tokens import Doc
 | 
					from ..tokens import Doc
 | 
				
			||||||
from ..vocab import Vocab
 | 
					from ..vocab import Vocab
 | 
				
			||||||
from ..language import component
 | 
					from ..language import component
 | 
				
			||||||
from ..util import link_vectors_to_models, minibatch, eg2doc
 | 
					from ..util import link_vectors_to_models, minibatch
 | 
				
			||||||
from .defaults import default_tok2vec
 | 
					from .defaults import default_tok2vec
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -51,22 +51,18 @@ class Tok2Vec(Pipe):
 | 
				
			||||||
        self.set_annotations([doc], tokvecses)
 | 
					        self.set_annotations([doc], tokvecses)
 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
 | 
					    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
				
			||||||
        """Process `Doc` objects as a stream.
 | 
					        """Process `Doc` objects as a stream.
 | 
				
			||||||
        stream (iterator): A sequence of `Doc` objects to process.
 | 
					        stream (iterator): A sequence of `Doc` objects to process.
 | 
				
			||||||
        batch_size (int): Number of `Doc` objects to group.
 | 
					        batch_size (int): Number of `Doc` objects to group.
 | 
				
			||||||
        n_threads (int): Number of threads.
 | 
					        n_threads (int): Number of threads.
 | 
				
			||||||
        YIELDS (iterator): A sequence of `Doc` objects, in order of input.
 | 
					        YIELDS (iterator): A sequence of `Doc` objects, in order of input.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        for batch in minibatch(stream, batch_size):
 | 
					        for docs in minibatch(stream, batch_size):
 | 
				
			||||||
            batch = list(batch)
 | 
					            docs = list(docs)
 | 
				
			||||||
            if as_example:
 | 
					 | 
				
			||||||
                docs = [eg2doc(doc) for doc in batch]
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                docs = batch
 | 
					 | 
				
			||||||
            tokvecses = self.predict(docs)
 | 
					            tokvecses = self.predict(docs)
 | 
				
			||||||
            self.set_annotations(docs, tokvecses)
 | 
					            self.set_annotations(docs, tokvecses)
 | 
				
			||||||
            yield from batch
 | 
					            yield from docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs):
 | 
					    def predict(self, docs):
 | 
				
			||||||
        """Return a single tensor for a batch of documents.
 | 
					        """Return a single tensor for a batch of documents.
 | 
				
			||||||
| 
						 | 
					@ -97,8 +93,7 @@ class Tok2Vec(Pipe):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        if losses is None:
 | 
					        if losses is None:
 | 
				
			||||||
            losses = {}
 | 
					            losses = {}
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					        docs = [eg.predicted for eg in examples]
 | 
				
			||||||
        docs = [eg.doc for eg in examples]
 | 
					 | 
				
			||||||
        if isinstance(docs, Doc):
 | 
					        if isinstance(docs, Doc):
 | 
				
			||||||
            docs = [docs]
 | 
					            docs = [docs]
 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					        set_dropout_rate(self.model, drop)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										118
									
								
								spacy/scorer.py
									
									
									
									
									
								
							
							
						
						
									
										118
									
								
								spacy/scorer.py
									
									
									
									
									
								
							| 
						 | 
					@ -1,6 +1,5 @@
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .gold import tags_to_entities, GoldParse, DocAnnotation
 | 
					 | 
				
			||||||
from .errors import Errors
 | 
					from .errors import Errors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -275,7 +274,7 @@ class Scorer(object):
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def score(self, example, verbose=False, punct_labels=("p", "punct")):
 | 
					    def score(self, example, verbose=False, punct_labels=("p", "punct")):
 | 
				
			||||||
        """Update the evaluation scores from a single Doc / GoldParse pair.
 | 
					        """Update the evaluation scores from a single Example.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        example (Example): The predicted annotations + correct annotations.
 | 
					        example (Example): The predicted annotations + correct annotations.
 | 
				
			||||||
        verbose (bool): Print debugging information.
 | 
					        verbose (bool): Print debugging information.
 | 
				
			||||||
| 
						 | 
					@ -285,17 +284,9 @@ class Scorer(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/scorer#score
 | 
					        DOCS: https://spacy.io/api/scorer#score
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        if isinstance(example, tuple) and len(example) == 2:
 | 
					        doc = example.predicted
 | 
				
			||||||
            doc, gold = example
 | 
					        gold_doc = example.reference
 | 
				
			||||||
        else:
 | 
					        align = example.alignment
 | 
				
			||||||
            gold = example.gold
 | 
					 | 
				
			||||||
            doc = example.doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if len(doc) != len(gold):
 | 
					 | 
				
			||||||
            doc_annotation = DocAnnotation(cats=gold.cats)
 | 
					 | 
				
			||||||
            token_annotation = gold.orig
 | 
					 | 
				
			||||||
            gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
 | 
					 | 
				
			||||||
        orig = gold.orig
 | 
					 | 
				
			||||||
        gold_deps = set()
 | 
					        gold_deps = set()
 | 
				
			||||||
        gold_deps_per_dep = {}
 | 
					        gold_deps_per_dep = {}
 | 
				
			||||||
        gold_tags = set()
 | 
					        gold_tags = set()
 | 
				
			||||||
| 
						 | 
					@ -303,36 +294,28 @@ class Scorer(object):
 | 
				
			||||||
        gold_morphs = set()
 | 
					        gold_morphs = set()
 | 
				
			||||||
        gold_morphs_per_feat = {}
 | 
					        gold_morphs_per_feat = {}
 | 
				
			||||||
        gold_sent_starts = set()
 | 
					        gold_sent_starts = set()
 | 
				
			||||||
        gold_ents = set(tags_to_entities(orig.entities))
 | 
					        for gold_i, token in enumerate(gold_doc):
 | 
				
			||||||
        for id_, tag, pos, morph, head, dep, sent_start in zip(
 | 
					            gold_tags.add((gold_i, token.tag_))
 | 
				
			||||||
            orig.ids,
 | 
					            gold_pos.add((gold_i, token.pos_))
 | 
				
			||||||
            orig.tags,
 | 
					            gold_morphs.add((gold_i, token.morph_))
 | 
				
			||||||
            orig.pos,
 | 
					            if token.morph_:
 | 
				
			||||||
            orig.morphs,
 | 
					                for feat in token.morph_.split("|"):
 | 
				
			||||||
            orig.heads,
 | 
					 | 
				
			||||||
            orig.deps,
 | 
					 | 
				
			||||||
            orig.sent_starts,
 | 
					 | 
				
			||||||
        ):
 | 
					 | 
				
			||||||
            gold_tags.add((id_, tag))
 | 
					 | 
				
			||||||
            gold_pos.add((id_, pos))
 | 
					 | 
				
			||||||
            gold_morphs.add((id_, morph))
 | 
					 | 
				
			||||||
            if morph:
 | 
					 | 
				
			||||||
                for feat in morph.split("|"):
 | 
					 | 
				
			||||||
                    field, values = feat.split("=")
 | 
					                    field, values = feat.split("=")
 | 
				
			||||||
                    if field not in self.morphs_per_feat:
 | 
					                    if field not in self.morphs_per_feat:
 | 
				
			||||||
                        self.morphs_per_feat[field] = PRFScore()
 | 
					                        self.morphs_per_feat[field] = PRFScore()
 | 
				
			||||||
                    if field not in gold_morphs_per_feat:
 | 
					                    if field not in gold_morphs_per_feat:
 | 
				
			||||||
                        gold_morphs_per_feat[field] = set()
 | 
					                        gold_morphs_per_feat[field] = set()
 | 
				
			||||||
                    gold_morphs_per_feat[field].add((id_, feat))
 | 
					                    gold_morphs_per_feat[field].add((gold_i, feat))
 | 
				
			||||||
            if sent_start:
 | 
					            if token.sent_start:
 | 
				
			||||||
                gold_sent_starts.add(id_)
 | 
					                gold_sent_starts.add(gold_i)
 | 
				
			||||||
            if dep not in (None, "") and dep.lower() not in punct_labels:
 | 
					            dep = token.dep_.lower()
 | 
				
			||||||
                gold_deps.add((id_, head, dep.lower()))
 | 
					            if dep not in punct_labels:
 | 
				
			||||||
                if dep.lower() not in self.labelled_per_dep:
 | 
					                gold_deps.add((gold_i, token.head.i, dep))
 | 
				
			||||||
                    self.labelled_per_dep[dep.lower()] = PRFScore()
 | 
					                if dep not in self.labelled_per_dep:
 | 
				
			||||||
                if dep.lower() not in gold_deps_per_dep:
 | 
					                    self.labelled_per_dep[dep] = PRFScore()
 | 
				
			||||||
                    gold_deps_per_dep[dep.lower()] = set()
 | 
					                if dep not in gold_deps_per_dep:
 | 
				
			||||||
                gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
 | 
					                    gold_deps_per_dep[dep] = set()
 | 
				
			||||||
 | 
					                gold_deps_per_dep[dep].add((gold_i, token.head.i, dep))
 | 
				
			||||||
        cand_deps = set()
 | 
					        cand_deps = set()
 | 
				
			||||||
        cand_deps_per_dep = {}
 | 
					        cand_deps_per_dep = {}
 | 
				
			||||||
        cand_tags = set()
 | 
					        cand_tags = set()
 | 
				
			||||||
| 
						 | 
					@ -343,7 +326,7 @@ class Scorer(object):
 | 
				
			||||||
        for token in doc:
 | 
					        for token in doc:
 | 
				
			||||||
            if token.orth_.isspace():
 | 
					            if token.orth_.isspace():
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            gold_i = gold.cand_to_gold[token.i]
 | 
					            gold_i = align.cand_to_gold[token.i]
 | 
				
			||||||
            if gold_i is None:
 | 
					            if gold_i is None:
 | 
				
			||||||
                self.tokens.fp += 1
 | 
					                self.tokens.fp += 1
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
| 
						 | 
					@ -362,7 +345,7 @@ class Scorer(object):
 | 
				
			||||||
                if token.is_sent_start:
 | 
					                if token.is_sent_start:
 | 
				
			||||||
                    cand_sent_starts.add(gold_i)
 | 
					                    cand_sent_starts.add(gold_i)
 | 
				
			||||||
            if token.dep_.lower() not in punct_labels and token.orth_.strip():
 | 
					            if token.dep_.lower() not in punct_labels and token.orth_.strip():
 | 
				
			||||||
                gold_head = gold.cand_to_gold[token.head.i]
 | 
					                gold_head = align.cand_to_gold[token.head.i]
 | 
				
			||||||
                # None is indistinct, so we can't just add it to the set
 | 
					                # None is indistinct, so we can't just add it to the set
 | 
				
			||||||
                # Multiple (None, None) deps are possible
 | 
					                # Multiple (None, None) deps are possible
 | 
				
			||||||
                if gold_i is None or gold_head is None:
 | 
					                if gold_i is None or gold_head is None:
 | 
				
			||||||
| 
						 | 
					@ -377,23 +360,30 @@ class Scorer(object):
 | 
				
			||||||
                    cand_deps_per_dep[token.dep_.lower()].add(
 | 
					                    cand_deps_per_dep[token.dep_.lower()].add(
 | 
				
			||||||
                        (gold_i, gold_head, token.dep_.lower())
 | 
					                        (gold_i, gold_head, token.dep_.lower())
 | 
				
			||||||
                    )
 | 
					                    )
 | 
				
			||||||
        if "-" not in [token[-1] for token in orig.entities]:
 | 
					        # Find all NER labels in gold and doc
 | 
				
			||||||
            # Find all NER labels in gold and doc
 | 
					        ent_labels = set(
 | 
				
			||||||
            ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
 | 
					            [k.label_ for k in gold_doc.ents] + [k.label_ for k in doc.ents]
 | 
				
			||||||
            # Set up all labels for per type scoring and prepare gold per type
 | 
					        )
 | 
				
			||||||
            gold_per_ents = {ent_label: set() for ent_label in ent_labels}
 | 
					        # Set up all labels for per type scoring and prepare gold per type
 | 
				
			||||||
            for ent_label in ent_labels:
 | 
					        gold_per_ents = {ent_label: set() for ent_label in ent_labels}
 | 
				
			||||||
                if ent_label not in self.ner_per_ents:
 | 
					        for ent_label in ent_labels:
 | 
				
			||||||
                    self.ner_per_ents[ent_label] = PRFScore()
 | 
					            if ent_label not in self.ner_per_ents:
 | 
				
			||||||
                gold_per_ents[ent_label].update(
 | 
					                self.ner_per_ents[ent_label] = PRFScore()
 | 
				
			||||||
                    [x for x in gold_ents if x[0] == ent_label]
 | 
					        # Find all candidate labels, for all and per type
 | 
				
			||||||
                )
 | 
					        gold_ents = set()
 | 
				
			||||||
            # Find all candidate labels, for all and per type
 | 
					        cand_ents = set()
 | 
				
			||||||
            cand_ents = set()
 | 
					        # If we have missing values in the gold, we can't easily tell whether
 | 
				
			||||||
 | 
					        # our NER predictions are true.
 | 
				
			||||||
 | 
					        # It seems bad but it's what we've always done.
 | 
				
			||||||
 | 
					        if all(token.ent_iob != 0 for token in gold_doc):
 | 
				
			||||||
 | 
					            for ent in gold_doc.ents:
 | 
				
			||||||
 | 
					                gold_ent = (ent.label_, ent.start, ent.end - 1)
 | 
				
			||||||
 | 
					                gold_ents.add(gold_ent)
 | 
				
			||||||
 | 
					                gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
 | 
				
			||||||
            cand_per_ents = {ent_label: set() for ent_label in ent_labels}
 | 
					            cand_per_ents = {ent_label: set() for ent_label in ent_labels}
 | 
				
			||||||
            for ent in doc.ents:
 | 
					            for ent in doc.ents:
 | 
				
			||||||
                first = gold.cand_to_gold[ent.start]
 | 
					                first = align.cand_to_gold[ent.start]
 | 
				
			||||||
                last = gold.cand_to_gold[ent.end - 1]
 | 
					                last = align.cand_to_gold[ent.end - 1]
 | 
				
			||||||
                if first is None or last is None:
 | 
					                if first is None or last is None:
 | 
				
			||||||
                    self.ner.fp += 1
 | 
					                    self.ner.fp += 1
 | 
				
			||||||
                    self.ner_per_ents[ent.label_].fp += 1
 | 
					                    self.ner_per_ents[ent.label_].fp += 1
 | 
				
			||||||
| 
						 | 
					@ -424,40 +414,40 @@ class Scorer(object):
 | 
				
			||||||
            set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
 | 
					            set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        if (
 | 
					        if (
 | 
				
			||||||
            len(gold.cats) > 0
 | 
					            len(gold_doc.cats) > 0
 | 
				
			||||||
            and set(self.textcat_f_per_cat)
 | 
					            and set(self.textcat_f_per_cat)
 | 
				
			||||||
            == set(self.textcat_auc_per_cat)
 | 
					            == set(self.textcat_auc_per_cat)
 | 
				
			||||||
            == set(gold.cats)
 | 
					            == set(gold_doc.cats)
 | 
				
			||||||
            and set(gold.cats) == set(doc.cats)
 | 
					            and set(gold_doc.cats) == set(doc.cats)
 | 
				
			||||||
        ):
 | 
					        ):
 | 
				
			||||||
            goldcat = max(gold.cats, key=gold.cats.get)
 | 
					            goldcat = max(gold_doc.cats, key=gold_doc.cats.get)
 | 
				
			||||||
            candcat = max(doc.cats, key=doc.cats.get)
 | 
					            candcat = max(doc.cats, key=doc.cats.get)
 | 
				
			||||||
            if self.textcat_positive_label:
 | 
					            if self.textcat_positive_label:
 | 
				
			||||||
                self.textcat.score_set(
 | 
					                self.textcat.score_set(
 | 
				
			||||||
                    set([self.textcat_positive_label]) & set([candcat]),
 | 
					                    set([self.textcat_positive_label]) & set([candcat]),
 | 
				
			||||||
                    set([self.textcat_positive_label]) & set([goldcat]),
 | 
					                    set([self.textcat_positive_label]) & set([goldcat]),
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            for label in set(gold.cats):
 | 
					            for label in set(gold_doc.cats):
 | 
				
			||||||
                self.textcat_auc_per_cat[label].score_set(
 | 
					                self.textcat_auc_per_cat[label].score_set(
 | 
				
			||||||
                    doc.cats[label], gold.cats[label]
 | 
					                    doc.cats[label], gold_doc.cats[label]
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                self.textcat_f_per_cat[label].score_set(
 | 
					                self.textcat_f_per_cat[label].score_set(
 | 
				
			||||||
                    set([label]) & set([candcat]), set([label]) & set([goldcat])
 | 
					                    set([label]) & set([candcat]), set([label]) & set([goldcat])
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
        elif len(self.textcat_f_per_cat) > 0:
 | 
					        elif len(self.textcat_f_per_cat) > 0:
 | 
				
			||||||
            model_labels = set(self.textcat_f_per_cat)
 | 
					            model_labels = set(self.textcat_f_per_cat)
 | 
				
			||||||
            eval_labels = set(gold.cats)
 | 
					            eval_labels = set(gold_doc.cats)
 | 
				
			||||||
            raise ValueError(
 | 
					            raise ValueError(
 | 
				
			||||||
                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
 | 
					                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        elif len(self.textcat_auc_per_cat) > 0:
 | 
					        elif len(self.textcat_auc_per_cat) > 0:
 | 
				
			||||||
            model_labels = set(self.textcat_auc_per_cat)
 | 
					            model_labels = set(self.textcat_auc_per_cat)
 | 
				
			||||||
            eval_labels = set(gold.cats)
 | 
					            eval_labels = set(gold_doc.cats)
 | 
				
			||||||
            raise ValueError(
 | 
					            raise ValueError(
 | 
				
			||||||
                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
 | 
					                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        if verbose:
 | 
					        if verbose:
 | 
				
			||||||
            gold_words = orig.words
 | 
					            gold_words = gold_doc.words
 | 
				
			||||||
            for w_id, h_id, dep in cand_deps - gold_deps:
 | 
					            for w_id, h_id, dep in cand_deps - gold_deps:
 | 
				
			||||||
                print("F", gold_words[w_id], dep, gold_words[h_id])
 | 
					                print("F", gold_words[w_id], dep, gold_words[h_id])
 | 
				
			||||||
            for w_id, h_id, dep in gold_deps - cand_deps:
 | 
					            for w_id, h_id, dep in gold_deps - cand_deps:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,9 +0,0 @@
 | 
				
			||||||
from ..typedefs cimport hash_t, class_t
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# These are passed as callbacks to thinc.search.Beam
 | 
					 | 
				
			||||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef int check_final_state(void* _state, void* extra_args) except -1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef hash_t hash_state(void* _state, void* _) except 0
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,329 +0,0 @@
 | 
				
			||||||
# cython: infer_types=True, profile=True
 | 
					 | 
				
			||||||
cimport numpy as np
 | 
					 | 
				
			||||||
from cpython.ref cimport PyObject, Py_XDECREF
 | 
					 | 
				
			||||||
from thinc.extra.search cimport Beam
 | 
					 | 
				
			||||||
from thinc.extra.search cimport MaxViolation
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from thinc.extra.search import MaxViolation
 | 
					 | 
				
			||||||
import numpy
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ..typedefs cimport hash_t, class_t
 | 
					 | 
				
			||||||
from .transition_system cimport TransitionSystem, Transition
 | 
					 | 
				
			||||||
from ..gold cimport GoldParse
 | 
					 | 
				
			||||||
from .stateclass cimport StateC, StateClass
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ..errors import Errors
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# These are passed as callbacks to thinc.search.Beam
 | 
					 | 
				
			||||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
 | 
					 | 
				
			||||||
    dest = <StateC*>_dest
 | 
					 | 
				
			||||||
    src = <StateC*>_src
 | 
					 | 
				
			||||||
    moves = <const Transition*>_moves
 | 
					 | 
				
			||||||
    dest.clone(src)
 | 
					 | 
				
			||||||
    moves[clas].do(dest, moves[clas].label)
 | 
					 | 
				
			||||||
    dest.push_hist(clas)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef int check_final_state(void* _state, void* extra_args) except -1:
 | 
					 | 
				
			||||||
    state = <StateC*>_state
 | 
					 | 
				
			||||||
    return state.is_final()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef hash_t hash_state(void* _state, void* _) except 0:
 | 
					 | 
				
			||||||
    state = <StateC*>_state
 | 
					 | 
				
			||||||
    if state.is_final():
 | 
					 | 
				
			||||||
        return 1
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        return state.hash()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def collect_states(beams):
 | 
					 | 
				
			||||||
    cdef StateClass state
 | 
					 | 
				
			||||||
    cdef Beam beam
 | 
					 | 
				
			||||||
    states = []
 | 
					 | 
				
			||||||
    for state_or_beam in beams:
 | 
					 | 
				
			||||||
        if isinstance(state_or_beam, StateClass):
 | 
					 | 
				
			||||||
            states.append(state_or_beam)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            beam = state_or_beam
 | 
					 | 
				
			||||||
            state = StateClass.borrow(<StateC*>beam.at(0))
 | 
					 | 
				
			||||||
            states.append(state)
 | 
					 | 
				
			||||||
    return states
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class ParserBeam(object):
 | 
					 | 
				
			||||||
    cdef public TransitionSystem moves
 | 
					 | 
				
			||||||
    cdef public object states
 | 
					 | 
				
			||||||
    cdef public object golds
 | 
					 | 
				
			||||||
    cdef public object beams
 | 
					 | 
				
			||||||
    cdef public object dones
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, TransitionSystem moves, states, golds,
 | 
					 | 
				
			||||||
                 int width, float density=0.):
 | 
					 | 
				
			||||||
        self.moves = moves
 | 
					 | 
				
			||||||
        self.states = states
 | 
					 | 
				
			||||||
        self.golds = golds
 | 
					 | 
				
			||||||
        self.beams = []
 | 
					 | 
				
			||||||
        cdef Beam beam
 | 
					 | 
				
			||||||
        cdef StateClass state
 | 
					 | 
				
			||||||
        cdef StateC* st
 | 
					 | 
				
			||||||
        for state in states:
 | 
					 | 
				
			||||||
            beam = Beam(self.moves.n_moves, width, min_density=density)
 | 
					 | 
				
			||||||
            beam.initialize(self.moves.init_beam_state,
 | 
					 | 
				
			||||||
                            self.moves.del_beam_state, state.c.length,
 | 
					 | 
				
			||||||
                            state.c._sent)
 | 
					 | 
				
			||||||
            for i in range(beam.width):
 | 
					 | 
				
			||||||
                st = <StateC*>beam.at(i)
 | 
					 | 
				
			||||||
                st.offset = state.c.offset
 | 
					 | 
				
			||||||
            self.beams.append(beam)
 | 
					 | 
				
			||||||
        self.dones = [False] * len(self.beams)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    @property
 | 
					 | 
				
			||||||
    def is_done(self):
 | 
					 | 
				
			||||||
        return all(b.is_done or self.dones[i]
 | 
					 | 
				
			||||||
                   for i, b in enumerate(self.beams))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __getitem__(self, i):
 | 
					 | 
				
			||||||
        return self.beams[i]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __len__(self):
 | 
					 | 
				
			||||||
        return len(self.beams)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def advance(self, scores, follow_gold=False):
 | 
					 | 
				
			||||||
        cdef Beam beam
 | 
					 | 
				
			||||||
        for i, beam in enumerate(self.beams):
 | 
					 | 
				
			||||||
            if beam.is_done or not scores[i].size or self.dones[i]:
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
            self._set_scores(beam, scores[i])
 | 
					 | 
				
			||||||
            if self.golds is not None:
 | 
					 | 
				
			||||||
                self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
 | 
					 | 
				
			||||||
            beam.advance(transition_state, hash_state, <void*>self.moves.c)
 | 
					 | 
				
			||||||
            beam.check_done(check_final_state, NULL)
 | 
					 | 
				
			||||||
            # This handles the non-monotonic stuff for the parser.
 | 
					 | 
				
			||||||
            if beam.is_done and self.golds is not None:
 | 
					 | 
				
			||||||
                for j in range(beam.size):
 | 
					 | 
				
			||||||
                    state = StateClass.borrow(<StateC*>beam.at(j))
 | 
					 | 
				
			||||||
                    if state.is_final():
 | 
					 | 
				
			||||||
                        try:
 | 
					 | 
				
			||||||
                            if self.moves.is_gold_parse(state, self.golds[i]):
 | 
					 | 
				
			||||||
                                beam._states[j].loss = 0.0
 | 
					 | 
				
			||||||
                        except NotImplementedError:
 | 
					 | 
				
			||||||
                            break
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _set_scores(self, Beam beam, float[:, ::1] scores):
 | 
					 | 
				
			||||||
        cdef float* c_scores = &scores[0, 0]
 | 
					 | 
				
			||||||
        cdef int nr_state = min(scores.shape[0], beam.size)
 | 
					 | 
				
			||||||
        cdef int nr_class = scores.shape[1]
 | 
					 | 
				
			||||||
        for i in range(nr_state):
 | 
					 | 
				
			||||||
            state = <StateC*>beam.at(i)
 | 
					 | 
				
			||||||
            if not state.is_final():
 | 
					 | 
				
			||||||
                for j in range(nr_class):
 | 
					 | 
				
			||||||
                    beam.scores[i][j] = c_scores[i * nr_class + j]
 | 
					 | 
				
			||||||
                self.moves.set_valid(beam.is_valid[i], state)
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                for j in range(beam.nr_class):
 | 
					 | 
				
			||||||
                    beam.scores[i][j] = 0
 | 
					 | 
				
			||||||
                    beam.costs[i][j] = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
 | 
					 | 
				
			||||||
        for i in range(beam.size):
 | 
					 | 
				
			||||||
            state = StateClass.borrow(<StateC*>beam.at(i))
 | 
					 | 
				
			||||||
            if not state.is_final():
 | 
					 | 
				
			||||||
                self.moves.set_costs(beam.is_valid[i], beam.costs[i],
 | 
					 | 
				
			||||||
                                     state, gold)
 | 
					 | 
				
			||||||
                if follow_gold:
 | 
					 | 
				
			||||||
                    min_cost = 0
 | 
					 | 
				
			||||||
                    for j in range(beam.nr_class):
 | 
					 | 
				
			||||||
                        if beam.is_valid[i][j] and beam.costs[i][j] < min_cost:
 | 
					 | 
				
			||||||
                            min_cost = beam.costs[i][j]
 | 
					 | 
				
			||||||
                    for j in range(beam.nr_class):
 | 
					 | 
				
			||||||
                        if beam.costs[i][j] > min_cost:
 | 
					 | 
				
			||||||
                            beam.is_valid[i][j] = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def get_token_ids(states, int n_tokens):
 | 
					 | 
				
			||||||
    cdef StateClass state
 | 
					 | 
				
			||||||
    cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
 | 
					 | 
				
			||||||
                                      dtype='int32', order='C')
 | 
					 | 
				
			||||||
    c_ids = <int*>ids.data
 | 
					 | 
				
			||||||
    for i, state in enumerate(states):
 | 
					 | 
				
			||||||
        if not state.is_final():
 | 
					 | 
				
			||||||
            state.c.set_context_tokens(c_ids, n_tokens)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            ids[i] = -1
 | 
					 | 
				
			||||||
        c_ids += ids.shape[1]
 | 
					 | 
				
			||||||
    return ids
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
nr_update = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
 | 
					 | 
				
			||||||
                states, golds,
 | 
					 | 
				
			||||||
                state2vec, vec2scores,
 | 
					 | 
				
			||||||
                int width, losses=None, drop=0.,
 | 
					 | 
				
			||||||
                early_update=True, beam_density=0.0):
 | 
					 | 
				
			||||||
    global nr_update
 | 
					 | 
				
			||||||
    cdef MaxViolation violn
 | 
					 | 
				
			||||||
    nr_update += 1
 | 
					 | 
				
			||||||
    pbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
 | 
					 | 
				
			||||||
    gbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
 | 
					 | 
				
			||||||
    cdef StateClass state
 | 
					 | 
				
			||||||
    beam_maps = []
 | 
					 | 
				
			||||||
    backprops = []
 | 
					 | 
				
			||||||
    violns = [MaxViolation() for _ in range(len(states))]
 | 
					 | 
				
			||||||
    for t in range(max_steps):
 | 
					 | 
				
			||||||
        if pbeam.is_done and gbeam.is_done:
 | 
					 | 
				
			||||||
            break
 | 
					 | 
				
			||||||
        # The beam maps let us find the right row in the flattened scores
 | 
					 | 
				
			||||||
        # arrays for each state. States are identified by (example id,
 | 
					 | 
				
			||||||
        # history). We keep a different beam map for each step (since we'll
 | 
					 | 
				
			||||||
        # have a flat scores array for each step). The beam map will let us
 | 
					 | 
				
			||||||
        # take the per-state losses, and compute the gradient for each (step,
 | 
					 | 
				
			||||||
        # state, class).
 | 
					 | 
				
			||||||
        beam_maps.append({})
 | 
					 | 
				
			||||||
        # Gather all states from the two beams in a list. Some stats may occur
 | 
					 | 
				
			||||||
        # in both beams. To figure out which beam each state belonged to,
 | 
					 | 
				
			||||||
        # we keep two lists of indices, p_indices and g_indices
 | 
					 | 
				
			||||||
        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
 | 
					 | 
				
			||||||
                                                  nr_update)
 | 
					 | 
				
			||||||
        if not states:
 | 
					 | 
				
			||||||
            break
 | 
					 | 
				
			||||||
        # Now that we have our flat list of states, feed them through the model
 | 
					 | 
				
			||||||
        token_ids = get_token_ids(states, nr_feature)
 | 
					 | 
				
			||||||
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
 | 
					 | 
				
			||||||
        scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Store the callbacks for the backward pass
 | 
					 | 
				
			||||||
        backprops.append((token_ids, bp_vectors, bp_scores))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Unpack the flat scores into lists for the two beams. The indices arrays
 | 
					 | 
				
			||||||
        # tell us which example and state the scores-row refers to.
 | 
					 | 
				
			||||||
        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
 | 
					 | 
				
			||||||
                    for indices in p_indices]
 | 
					 | 
				
			||||||
        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
 | 
					 | 
				
			||||||
                    for indices in g_indices]
 | 
					 | 
				
			||||||
        # Now advance the states in the beams. The gold beam is constrained to
 | 
					 | 
				
			||||||
        # to follow only gold analyses.
 | 
					 | 
				
			||||||
        pbeam.advance(p_scores)
 | 
					 | 
				
			||||||
        gbeam.advance(g_scores, follow_gold=True)
 | 
					 | 
				
			||||||
        # Track the "maximum violation", to use in the update.
 | 
					 | 
				
			||||||
        for i, violn in enumerate(violns):
 | 
					 | 
				
			||||||
            violn.check_crf(pbeam[i], gbeam[i])
 | 
					 | 
				
			||||||
    histories = []
 | 
					 | 
				
			||||||
    losses = []
 | 
					 | 
				
			||||||
    for violn in violns:
 | 
					 | 
				
			||||||
        if violn.p_hist:
 | 
					 | 
				
			||||||
            histories.append(violn.p_hist + violn.g_hist)
 | 
					 | 
				
			||||||
            losses.append(violn.p_probs + violn.g_probs)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            histories.append([])
 | 
					 | 
				
			||||||
            losses.append([])
 | 
					 | 
				
			||||||
    states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
 | 
					 | 
				
			||||||
    beams = list(pbeam.beams) + list(gbeam.beams)
 | 
					 | 
				
			||||||
    return states_d_scores, backprops[:len(states_d_scores)], beams
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def get_states(pbeams, gbeams, beam_map, nr_update):
 | 
					 | 
				
			||||||
    seen = {}
 | 
					 | 
				
			||||||
    states = []
 | 
					 | 
				
			||||||
    p_indices = []
 | 
					 | 
				
			||||||
    g_indices = []
 | 
					 | 
				
			||||||
    cdef Beam pbeam, gbeam
 | 
					 | 
				
			||||||
    if len(pbeams) != len(gbeams):
 | 
					 | 
				
			||||||
        raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams)))
 | 
					 | 
				
			||||||
    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
 | 
					 | 
				
			||||||
        p_indices.append([])
 | 
					 | 
				
			||||||
        g_indices.append([])
 | 
					 | 
				
			||||||
        for i in range(pbeam.size):
 | 
					 | 
				
			||||||
            state = StateClass.borrow(<StateC*>pbeam.at(i))
 | 
					 | 
				
			||||||
            if not state.is_final():
 | 
					 | 
				
			||||||
                key = tuple([eg_id] + pbeam.histories[i])
 | 
					 | 
				
			||||||
                if key in seen:
 | 
					 | 
				
			||||||
                    raise ValueError(Errors.E080.format(key=key))
 | 
					 | 
				
			||||||
                seen[key] = len(states)
 | 
					 | 
				
			||||||
                p_indices[-1].append(len(states))
 | 
					 | 
				
			||||||
                states.append(state)
 | 
					 | 
				
			||||||
        beam_map.update(seen)
 | 
					 | 
				
			||||||
        for i in range(gbeam.size):
 | 
					 | 
				
			||||||
            state = StateClass.borrow(<StateC*>gbeam.at(i))
 | 
					 | 
				
			||||||
            if not state.is_final():
 | 
					 | 
				
			||||||
                key = tuple([eg_id] + gbeam.histories[i])
 | 
					 | 
				
			||||||
                if key in seen:
 | 
					 | 
				
			||||||
                    g_indices[-1].append(seen[key])
 | 
					 | 
				
			||||||
                else:
 | 
					 | 
				
			||||||
                    g_indices[-1].append(len(states))
 | 
					 | 
				
			||||||
                    beam_map[key] = len(states)
 | 
					 | 
				
			||||||
                    states.append(state)
 | 
					 | 
				
			||||||
    p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
 | 
					 | 
				
			||||||
    g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
 | 
					 | 
				
			||||||
    return states, p_idx, g_idx
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def get_gradient(nr_class, beam_maps, histories, losses):
 | 
					 | 
				
			||||||
    """The global model assigns a loss to each parse. The beam scores
 | 
					 | 
				
			||||||
    are additive, so the same gradient is applied to each action
 | 
					 | 
				
			||||||
    in the history. This gives the gradient of a single *action*
 | 
					 | 
				
			||||||
    for a beam state -- so we have "the gradient of loss for taking
 | 
					 | 
				
			||||||
    action i given history H."
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Histories: Each hitory is a list of actions
 | 
					 | 
				
			||||||
    Each candidate has a history
 | 
					 | 
				
			||||||
    Each beam has multiple candidates
 | 
					 | 
				
			||||||
    Each batch has multiple beams
 | 
					 | 
				
			||||||
    So history is list of lists of lists of ints
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    grads = []
 | 
					 | 
				
			||||||
    nr_steps = []
 | 
					 | 
				
			||||||
    for eg_id, hists in enumerate(histories):
 | 
					 | 
				
			||||||
        nr_step = 0
 | 
					 | 
				
			||||||
        for loss, hist in zip(losses[eg_id], hists):
 | 
					 | 
				
			||||||
            if loss != 0.0 and not numpy.isnan(loss):
 | 
					 | 
				
			||||||
                nr_step = max(nr_step, len(hist))
 | 
					 | 
				
			||||||
        nr_steps.append(nr_step)
 | 
					 | 
				
			||||||
    for i in range(max(nr_steps)):
 | 
					 | 
				
			||||||
        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
 | 
					 | 
				
			||||||
                                 dtype='f'))
 | 
					 | 
				
			||||||
    if len(histories) != len(losses):
 | 
					 | 
				
			||||||
        raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses)))
 | 
					 | 
				
			||||||
    for eg_id, hists in enumerate(histories):
 | 
					 | 
				
			||||||
        for loss, hist in zip(losses[eg_id], hists):
 | 
					 | 
				
			||||||
            if loss == 0.0 or numpy.isnan(loss):
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
            key = tuple([eg_id])
 | 
					 | 
				
			||||||
            # Adjust loss for length
 | 
					 | 
				
			||||||
            # We need to do this because each state in a short path is scored
 | 
					 | 
				
			||||||
            # multiple times, as we add in the average cost when we run out
 | 
					 | 
				
			||||||
            # of actions.
 | 
					 | 
				
			||||||
            avg_loss = loss / len(hist)
 | 
					 | 
				
			||||||
            loss += avg_loss * (nr_steps[eg_id] - len(hist))
 | 
					 | 
				
			||||||
            for j, clas in enumerate(hist):
 | 
					 | 
				
			||||||
                i = beam_maps[j][key]
 | 
					 | 
				
			||||||
                # In step j, at state i action clas
 | 
					 | 
				
			||||||
                # resulted in loss
 | 
					 | 
				
			||||||
                grads[j][i, clas] += loss
 | 
					 | 
				
			||||||
                key = key + tuple([clas])
 | 
					 | 
				
			||||||
    return grads
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def cleanup_beam(Beam beam):
 | 
					 | 
				
			||||||
    cdef StateC* state
 | 
					 | 
				
			||||||
    # Once parsing has finished, states in beam may not be unique. Is this
 | 
					 | 
				
			||||||
    # correct?
 | 
					 | 
				
			||||||
    seen = set()
 | 
					 | 
				
			||||||
    for i in range(beam.width):
 | 
					 | 
				
			||||||
        addr = <size_t>beam._parents[i].content
 | 
					 | 
				
			||||||
        if addr not in seen:
 | 
					 | 
				
			||||||
            state = <StateC*>addr
 | 
					 | 
				
			||||||
            del state
 | 
					 | 
				
			||||||
            seen.add(addr)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            raise ValueError(Errors.E023.format(addr=addr, i=i))
 | 
					 | 
				
			||||||
        addr = <size_t>beam._states[i].content
 | 
					 | 
				
			||||||
        if addr not in seen:
 | 
					 | 
				
			||||||
            state = <StateC*>addr
 | 
					 | 
				
			||||||
            del state
 | 
					 | 
				
			||||||
            seen.add(addr)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            raise ValueError(Errors.E023.format(addr=addr, i=i))
 | 
					 | 
				
			||||||
| 
						 | 
					@ -16,7 +16,6 @@ from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..typedefs cimport weight_t, class_t, hash_t
 | 
					from ..typedefs cimport weight_t, class_t, hash_t
 | 
				
			||||||
from ..tokens.doc cimport Doc
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
from ..gold cimport GoldParse
 | 
					 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from .transition_system cimport Transition
 | 
					from .transition_system cimport Transition
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -24,7 +23,6 @@ from ..compat import copy_array
 | 
				
			||||||
from ..errors import Errors, TempErrors
 | 
					from ..errors import Errors, TempErrors
 | 
				
			||||||
from ..util import link_vectors_to_models, create_default_optimizer
 | 
					from ..util import link_vectors_to_models, create_default_optimizer
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from . import _beam_utils
 | 
					 | 
				
			||||||
from . import nonproj
 | 
					from . import nonproj
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -261,8 +259,7 @@ class ParserStepModel(Model):
 | 
				
			||||||
    def mark_class_seen(self, class_):
 | 
					    def mark_class_seen(self, class_):
 | 
				
			||||||
        self._class_mask[class_] = 1
 | 
					        self._class_mask[class_] = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_token_ids(self, batch):
 | 
					    def get_token_ids(self, states):
 | 
				
			||||||
        states = _beam_utils.collect_states(batch)
 | 
					 | 
				
			||||||
        cdef StateClass state
 | 
					        cdef StateClass state
 | 
				
			||||||
        states = [state for state in states if not state.is_final()]
 | 
					        states = [state for state in states if not state.is_final()]
 | 
				
			||||||
        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
 | 
					        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,12 +3,11 @@ from cymem.cymem cimport Pool
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from ..typedefs cimport weight_t, attr_t
 | 
					from ..typedefs cimport weight_t, attr_t
 | 
				
			||||||
from .transition_system cimport TransitionSystem, Transition
 | 
					from .transition_system cimport TransitionSystem, Transition
 | 
				
			||||||
from ..gold cimport GoldParseC
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class ArcEager(TransitionSystem):
 | 
					cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
    pass
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil
 | 
					cdef weight_t push_cost(StateClass stcls, const void* _gold, int target) nogil
 | 
				
			||||||
cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil
 | 
					cdef weight_t arc_cost(StateClass stcls, const void* _gold, int head, int child) nogil
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,19 +1,19 @@
 | 
				
			||||||
# cython: profile=True, cdivision=True, infer_types=True
 | 
					# cython: profile=True, cdivision=True, infer_types=True
 | 
				
			||||||
from cpython.ref cimport Py_INCREF
 | 
					from cpython.ref cimport Py_INCREF
 | 
				
			||||||
from cymem.cymem cimport Pool
 | 
					from cymem.cymem cimport Pool, Address
 | 
				
			||||||
from thinc.extra.search cimport Beam
 | 
					from libc.stdint cimport int32_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from collections import defaultdict, Counter
 | 
					from collections import defaultdict, Counter
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..typedefs cimport hash_t, attr_t
 | 
					from ..typedefs cimport hash_t, attr_t
 | 
				
			||||||
from ..strings cimport hash_string
 | 
					from ..strings cimport hash_string
 | 
				
			||||||
from ..gold cimport GoldParse, GoldParseC
 | 
					 | 
				
			||||||
from ..structs cimport TokenC
 | 
					from ..structs cimport TokenC
 | 
				
			||||||
from ..tokens.doc cimport Doc, set_children_from_heads
 | 
					from ..tokens.doc cimport Doc, set_children_from_heads
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from ._state cimport StateC
 | 
					from ._state cimport StateC
 | 
				
			||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
 | 
					from .transition_system cimport move_cost_func_t, label_cost_func_t
 | 
				
			||||||
 | 
					from ..gold.example cimport Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
from .nonproj import is_nonproj_tree
 | 
					from .nonproj import is_nonproj_tree
 | 
				
			||||||
| 
						 | 
					@ -49,53 +49,232 @@ MOVE_NAMES[RIGHT] = 'R'
 | 
				
			||||||
MOVE_NAMES[BREAK] = 'B'
 | 
					MOVE_NAMES[BREAK] = 'B'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef enum:
 | 
				
			||||||
 | 
					    HEAD_IN_STACK = 0
 | 
				
			||||||
 | 
					    HEAD_IN_BUFFER
 | 
				
			||||||
 | 
					    HEAD_UNKNOWN
 | 
				
			||||||
 | 
					    IS_SENT_START
 | 
				
			||||||
 | 
					    SENT_START_UNKNOWN
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef struct GoldParseStateC:
 | 
				
			||||||
 | 
					    char* state_bits
 | 
				
			||||||
 | 
					    int32_t* n_kids_in_buffer
 | 
				
			||||||
 | 
					    int32_t* n_kids_in_stack
 | 
				
			||||||
 | 
					    int32_t* heads
 | 
				
			||||||
 | 
					    attr_t* labels
 | 
				
			||||||
 | 
					    int32_t** kids
 | 
				
			||||||
 | 
					    int32_t* n_kids
 | 
				
			||||||
 | 
					    int32_t length
 | 
				
			||||||
 | 
					    int32_t stride
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls,
 | 
				
			||||||
 | 
					        heads, labels, sent_starts) except *:
 | 
				
			||||||
 | 
					    cdef GoldParseStateC gs
 | 
				
			||||||
 | 
					    gs.length = len(heads)
 | 
				
			||||||
 | 
					    gs.stride = 1
 | 
				
			||||||
 | 
					    gs.labels = <attr_t*>mem.alloc(gs.length, sizeof(gs.labels[0]))
 | 
				
			||||||
 | 
					    gs.heads = <int32_t*>mem.alloc(gs.length, sizeof(gs.heads[0]))
 | 
				
			||||||
 | 
					    gs.n_kids = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids[0]))
 | 
				
			||||||
 | 
					    gs.state_bits = <char*>mem.alloc(gs.length, sizeof(gs.state_bits[0]))
 | 
				
			||||||
 | 
					    gs.n_kids_in_buffer = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_buffer[0]))
 | 
				
			||||||
 | 
					    gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for i, is_sent_start in enumerate(sent_starts):
 | 
				
			||||||
 | 
					        if is_sent_start == True:
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                IS_SENT_START,
 | 
				
			||||||
 | 
					                1
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                SENT_START_UNKNOWN,
 | 
				
			||||||
 | 
					                0
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					        elif is_sent_start is None:
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                SENT_START_UNKNOWN,
 | 
				
			||||||
 | 
					                1
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                IS_SENT_START,
 | 
				
			||||||
 | 
					                0
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                SENT_START_UNKNOWN,
 | 
				
			||||||
 | 
					                0
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                IS_SENT_START,
 | 
				
			||||||
 | 
					                0
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					    for i, (head, label) in enumerate(zip(heads, labels)):
 | 
				
			||||||
 | 
					        if head is not None:
 | 
				
			||||||
 | 
					            gs.heads[i] = head
 | 
				
			||||||
 | 
					            gs.labels[i] = label
 | 
				
			||||||
 | 
					            if i != head:
 | 
				
			||||||
 | 
					                gs.n_kids[head] += 1
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                HEAD_UNKNOWN,
 | 
				
			||||||
 | 
					                0
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[i],
 | 
				
			||||||
 | 
					                HEAD_UNKNOWN,
 | 
				
			||||||
 | 
					                1
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					    # Make an array of pointers, pointing into the gs_kids_flat array.
 | 
				
			||||||
 | 
					    gs.kids = <int32_t**>mem.alloc(gs.length, sizeof(int32_t*))
 | 
				
			||||||
 | 
					    for i in range(gs.length):
 | 
				
			||||||
 | 
					        if gs.n_kids[i] != 0:
 | 
				
			||||||
 | 
					            gs.kids[i] = <int32_t*>mem.alloc(gs.n_kids[i], sizeof(int32_t))
 | 
				
			||||||
 | 
					    # This is a temporary buffer
 | 
				
			||||||
 | 
					    js_addr = Address(gs.length, sizeof(int32_t))
 | 
				
			||||||
 | 
					    js = <int32_t*>js_addr.ptr
 | 
				
			||||||
 | 
					    for i in range(gs.length):
 | 
				
			||||||
 | 
					        if not is_head_unknown(&gs, i):
 | 
				
			||||||
 | 
					            head = gs.heads[i]
 | 
				
			||||||
 | 
					            if head != i:
 | 
				
			||||||
 | 
					                gs.kids[head][js[head]] = i
 | 
				
			||||||
 | 
					                js[head] += 1
 | 
				
			||||||
 | 
					    return gs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef void update_gold_state(GoldParseStateC* gs, StateClass stcls) nogil:
 | 
				
			||||||
 | 
					    for i in range(gs.length):
 | 
				
			||||||
 | 
					        gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					            gs.state_bits[i],
 | 
				
			||||||
 | 
					            HEAD_IN_BUFFER,
 | 
				
			||||||
 | 
					            0
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        gs.state_bits[i] = set_state_flag(
 | 
				
			||||||
 | 
					            gs.state_bits[i],
 | 
				
			||||||
 | 
					            HEAD_IN_STACK,
 | 
				
			||||||
 | 
					            0
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        gs.n_kids_in_stack[i] = 0
 | 
				
			||||||
 | 
					        gs.n_kids_in_buffer[i] = 0
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					    for i in range(stcls.stack_depth()):
 | 
				
			||||||
 | 
					        s_i = stcls.S(i)
 | 
				
			||||||
 | 
					        if not is_head_unknown(gs, s_i):
 | 
				
			||||||
 | 
					            gs.n_kids_in_stack[gs.heads[s_i]] += 1
 | 
				
			||||||
 | 
					        for kid in gs.kids[s_i][:gs.n_kids[s_i]]:
 | 
				
			||||||
 | 
					            gs.state_bits[kid] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[kid],
 | 
				
			||||||
 | 
					                HEAD_IN_STACK,
 | 
				
			||||||
 | 
					                1
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					    for i in range(stcls.buffer_length()):
 | 
				
			||||||
 | 
					        b_i = stcls.B(i)
 | 
				
			||||||
 | 
					        if not is_head_unknown(gs, b_i):
 | 
				
			||||||
 | 
					            gs.n_kids_in_buffer[gs.heads[b_i]] += 1
 | 
				
			||||||
 | 
					        for kid in gs.kids[b_i][:gs.n_kids[b_i]]:
 | 
				
			||||||
 | 
					            gs.state_bits[kid] = set_state_flag(
 | 
				
			||||||
 | 
					                gs.state_bits[kid],
 | 
				
			||||||
 | 
					                HEAD_IN_BUFFER,
 | 
				
			||||||
 | 
					                1
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class ArcEagerGold:
 | 
				
			||||||
 | 
					    cdef GoldParseStateC c
 | 
				
			||||||
 | 
					    cdef Pool mem
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, ArcEager moves, StateClass stcls, Example example):
 | 
				
			||||||
 | 
					        self.mem = Pool()
 | 
				
			||||||
 | 
					        heads, labels = example.get_aligned_parse(projectivize=True)
 | 
				
			||||||
 | 
					        labels = [label if label is not None else "" for label in labels]
 | 
				
			||||||
 | 
					        labels = [example.x.vocab.strings.add(label) for label in labels]
 | 
				
			||||||
 | 
					        sent_starts = example.get_aligned("SENT_START")
 | 
				
			||||||
 | 
					        assert len(heads) == len(labels) == len(sent_starts)
 | 
				
			||||||
 | 
					        self.c = create_gold_state(self.mem, stcls, heads, labels, sent_starts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def update(self, StateClass stcls):
 | 
				
			||||||
 | 
					        update_gold_state(&self.c, stcls)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int check_state_gold(char state_bits, char flag) nogil:
 | 
				
			||||||
 | 
					    cdef char one = 1
 | 
				
			||||||
 | 
					    return state_bits & (one << flag)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int set_state_flag(char state_bits, char flag, int value) nogil:
 | 
				
			||||||
 | 
					    cdef char one = 1
 | 
				
			||||||
 | 
					    if value:
 | 
				
			||||||
 | 
					        return state_bits | (one << flag)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        return state_bits & ~(one << flag)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int is_head_in_stack(const GoldParseStateC* gold, int i) nogil:
 | 
				
			||||||
 | 
					    return check_state_gold(gold.state_bits[i], HEAD_IN_STACK)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int is_head_in_buffer(const GoldParseStateC* gold, int i) nogil:
 | 
				
			||||||
 | 
					    return check_state_gold(gold.state_bits[i], HEAD_IN_BUFFER)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int is_head_unknown(const GoldParseStateC* gold, int i) nogil:
 | 
				
			||||||
 | 
					    return check_state_gold(gold.state_bits[i], HEAD_UNKNOWN)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int is_sent_start(const GoldParseStateC* gold, int i) nogil:
 | 
				
			||||||
 | 
					    return check_state_gold(gold.state_bits[i], IS_SENT_START)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef int is_sent_start_unknown(const GoldParseStateC* gold, int i) nogil:
 | 
				
			||||||
 | 
					    return check_state_gold(gold.state_bits[i], SENT_START_UNKNOWN)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Helper functions for the arc-eager oracle
 | 
					# Helper functions for the arc-eager oracle
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
 | 
					cdef weight_t push_cost(StateClass stcls, const void* _gold, int target) nogil:
 | 
				
			||||||
 | 
					    gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
    cdef weight_t cost = 0
 | 
					    cdef weight_t cost = 0
 | 
				
			||||||
    cdef int i, S_i
 | 
					    if is_head_in_stack(gold, target):
 | 
				
			||||||
    for i in range(stcls.stack_depth()):
 | 
					        cost += 1
 | 
				
			||||||
        S_i = stcls.S(i)
 | 
					    cost += gold.n_kids_in_stack[target]
 | 
				
			||||||
        if gold.heads[target] == S_i:
 | 
					 | 
				
			||||||
            cost += 1
 | 
					 | 
				
			||||||
        if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
 | 
					 | 
				
			||||||
            cost += 1
 | 
					 | 
				
			||||||
        if BINARY_COSTS and cost >= 1:
 | 
					 | 
				
			||||||
            return cost
 | 
					 | 
				
			||||||
    cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
 | 
					 | 
				
			||||||
    return cost
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nogil:
 | 
					 | 
				
			||||||
    cdef weight_t cost = 0
 | 
					 | 
				
			||||||
    cdef int i, B_i
 | 
					 | 
				
			||||||
    for i in range(stcls.buffer_length()):
 | 
					 | 
				
			||||||
        B_i = stcls.B(i)
 | 
					 | 
				
			||||||
        cost += gold.heads[B_i] == target
 | 
					 | 
				
			||||||
        cost += gold.heads[target] == B_i
 | 
					 | 
				
			||||||
        if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
 | 
					 | 
				
			||||||
            break
 | 
					 | 
				
			||||||
        if BINARY_COSTS and cost >= 1:
 | 
					 | 
				
			||||||
            return cost
 | 
					 | 
				
			||||||
    if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
 | 
					    if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
 | 
				
			||||||
        cost += 1
 | 
					        cost += 1
 | 
				
			||||||
    return cost
 | 
					    return cost
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int child) nogil:
 | 
					cdef weight_t pop_cost(StateClass stcls, const void* _gold, int target) nogil:
 | 
				
			||||||
 | 
					    gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
 | 
					    cdef weight_t cost = 0
 | 
				
			||||||
 | 
					    if is_head_in_buffer(gold, target):
 | 
				
			||||||
 | 
					        cost += 1
 | 
				
			||||||
 | 
					    cost += gold[0].n_kids_in_buffer[target]
 | 
				
			||||||
 | 
					    if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
 | 
				
			||||||
 | 
					        cost += 1
 | 
				
			||||||
 | 
					    return cost
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef weight_t arc_cost(StateClass stcls, const void* _gold, int head, int child) nogil:
 | 
				
			||||||
 | 
					    gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
    if arc_is_gold(gold, head, child):
 | 
					    if arc_is_gold(gold, head, child):
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
    elif stcls.H(child) == gold.heads[child]:
 | 
					    elif stcls.H(child) == gold.heads[child]:
 | 
				
			||||||
        return 1
 | 
					        return 1
 | 
				
			||||||
    # Head in buffer
 | 
					    # Head in buffer
 | 
				
			||||||
    elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != 0:
 | 
					    elif is_head_in_buffer(gold, child):
 | 
				
			||||||
        return 1
 | 
					        return 1
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
 | 
					cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) nogil:
 | 
				
			||||||
    if not gold.has_dep[child]:
 | 
					    if is_head_unknown(gold, child):
 | 
				
			||||||
        return True
 | 
					        return True
 | 
				
			||||||
    elif gold.heads[child] == head:
 | 
					    elif gold.heads[child] == head:
 | 
				
			||||||
        return True
 | 
					        return True
 | 
				
			||||||
| 
						 | 
					@ -103,8 +282,8 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
 | 
				
			||||||
        return False
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
 | 
					cdef bint label_is_gold(const GoldParseStateC* gold, int head, int child, attr_t label) nogil:
 | 
				
			||||||
    if not gold.has_dep[child]:
 | 
					    if is_head_unknown(gold, child):
 | 
				
			||||||
        return True
 | 
					        return True
 | 
				
			||||||
    elif label == 0:
 | 
					    elif label == 0:
 | 
				
			||||||
        return True
 | 
					        return True
 | 
				
			||||||
| 
						 | 
					@ -114,8 +293,9 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t labe
 | 
				
			||||||
        return False
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
 | 
					cdef bint _is_gold_root(const GoldParseStateC* gold, int word) nogil:
 | 
				
			||||||
    return gold.heads[word] == word or not gold.has_dep[word]
 | 
					    return gold.heads[word] == word or is_head_unknown(gold, word)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class Shift:
 | 
					cdef class Shift:
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
| 
						 | 
					@ -129,15 +309,17 @@ cdef class Shift:
 | 
				
			||||||
        st.fast_forward()
 | 
					        st.fast_forward()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass st, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
 | 
					        return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil:
 | 
					    cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        return push_cost(s, gold, s.B(0))
 | 
					        return push_cost(s, gold, s.B(0))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef inline weight_t label_cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -155,26 +337,28 @@ cdef class Reduce:
 | 
				
			||||||
        st.fast_forward()
 | 
					        st.fast_forward()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
 | 
					        return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t move_cost(StateClass st, const GoldParseC* gold) nogil:
 | 
					    cdef inline weight_t move_cost(StateClass st, const void* _gold) nogil:
 | 
				
			||||||
        cost = pop_cost(st, gold, st.S(0))
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        if not st.has_head(st.S(0)):
 | 
					        s0 = st.S(0)
 | 
				
			||||||
            # Decrement cost for the arcs e save
 | 
					        cost = pop_cost(st, gold, s0)
 | 
				
			||||||
            for i in range(1, st.stack_depth()):
 | 
					        return_to_buffer = not st.has_head(s0)
 | 
				
			||||||
                S_i = st.S(i)
 | 
					        if return_to_buffer:
 | 
				
			||||||
                if gold.heads[st.S(0)] == S_i:
 | 
					            # Decrement cost for the arcs we save, as we'll be putting this
 | 
				
			||||||
                    cost -= 1
 | 
					            # back to the buffer
 | 
				
			||||||
                if gold.heads[S_i] == st.S(0):
 | 
					            if is_head_in_stack(gold, s0):
 | 
				
			||||||
                    cost -= 1
 | 
					                cost -= 1
 | 
				
			||||||
 | 
					            cost -= gold.n_kids_in_stack[s0]
 | 
				
			||||||
            if Break.is_valid(st.c, 0) and Break.move_cost(st, gold) == 0:
 | 
					            if Break.is_valid(st.c, 0) and Break.move_cost(st, gold) == 0:
 | 
				
			||||||
                cost -= 1
 | 
					                cost -= 1
 | 
				
			||||||
        return cost
 | 
					        return cost
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef inline weight_t label_cost(StateClass s, const void* gold, attr_t label) nogil:
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -193,25 +377,28 @@ cdef class LeftArc:
 | 
				
			||||||
        st.fast_forward()
 | 
					        st.fast_forward()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef inline weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
 | 
					        return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil:
 | 
					    cdef inline weight_t move_cost(StateClass s, const GoldParseStateC* gold) nogil:
 | 
				
			||||||
        cdef weight_t cost = 0
 | 
					        cdef weight_t cost = 0
 | 
				
			||||||
        if arc_is_gold(gold, s.B(0), s.S(0)):
 | 
					        s0 = s.S(0)
 | 
				
			||||||
 | 
					        b0 = s.B(0)
 | 
				
			||||||
 | 
					        if arc_is_gold(gold, b0, s0):
 | 
				
			||||||
            # Have a negative cost if we 'recover' from the wrong dependency
 | 
					            # Have a negative cost if we 'recover' from the wrong dependency
 | 
				
			||||||
            return 0 if not s.has_head(s.S(0)) else -1
 | 
					            return 0 if not s.has_head(s0) else -1
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            # Account for deps we might lose between S0 and stack
 | 
					            # Account for deps we might lose between S0 and stack
 | 
				
			||||||
            if not s.has_head(s.S(0)):
 | 
					            if not s.has_head(s0):
 | 
				
			||||||
                for i in range(1, s.stack_depth()):
 | 
					                cost += gold.n_kids_in_stack[s0]
 | 
				
			||||||
                    cost += gold.heads[s.S(i)] == s.S(0)
 | 
					                if is_head_in_buffer(gold, s0):
 | 
				
			||||||
                    cost += gold.heads[s.S(0)] == s.S(i)
 | 
					                    cost += 1
 | 
				
			||||||
            return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
 | 
					            return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef inline weight_t label_cost(StateClass s, const GoldParseStateC* gold, attr_t label) nogil:
 | 
				
			||||||
        return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
 | 
					        return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -231,11 +418,13 @@ cdef class RightArc:
 | 
				
			||||||
        st.fast_forward()
 | 
					        st.fast_forward()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef inline weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
 | 
					        return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil:
 | 
					    cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        if arc_is_gold(gold, s.S(0), s.B(0)):
 | 
					        if arc_is_gold(gold, s.S(0), s.B(0)):
 | 
				
			||||||
            return 0
 | 
					            return 0
 | 
				
			||||||
        elif s.c.shifted[s.B(0)]:
 | 
					        elif s.c.shifted[s.B(0)]:
 | 
				
			||||||
| 
						 | 
					@ -244,7 +433,8 @@ cdef class RightArc:
 | 
				
			||||||
            return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
 | 
					            return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t label_cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
 | 
					        return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -271,23 +461,22 @@ cdef class Break:
 | 
				
			||||||
        st.fast_forward()
 | 
					        st.fast_forward()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
 | 
					        return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t move_cost(StateClass s, const GoldParseC* gold) nogil:
 | 
					    cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil:
 | 
				
			||||||
        cdef weight_t cost = 0
 | 
					        gold = <const GoldParseStateC*>_gold
 | 
				
			||||||
        cdef int i, j, S_i, B_i
 | 
					        cost = 0
 | 
				
			||||||
        for i in range(s.stack_depth()):
 | 
					        for i in range(s.stack_depth()):
 | 
				
			||||||
            S_i = s.S(i)
 | 
					            S_i = s.S(i)
 | 
				
			||||||
            for j in range(s.buffer_length()):
 | 
					            cost += gold.n_kids_in_buffer[S_i]
 | 
				
			||||||
                B_i = s.B(j)
 | 
					            if is_head_in_buffer(gold, S_i):
 | 
				
			||||||
                cost += gold.heads[S_i] == B_i
 | 
					                cost += 1
 | 
				
			||||||
                cost += gold.heads[B_i] == S_i
 | 
					        # It's weird not to check the gold sentence boundaries but if we do,
 | 
				
			||||||
                if cost != 0:
 | 
					        # we can't account for "sunk costs", i.e. situations where we're already
 | 
				
			||||||
                    return cost
 | 
					        # wrong.
 | 
				
			||||||
        # Check for sentence boundary --- if it's here, we can't have any deps
 | 
					 | 
				
			||||||
        # between stack and buffer, so rest of action is irrelevant.
 | 
					 | 
				
			||||||
        s0_root = _get_root(s.S(0), gold)
 | 
					        s0_root = _get_root(s.S(0), gold)
 | 
				
			||||||
        b0_root = _get_root(s.B(0), gold)
 | 
					        b0_root = _get_root(s.B(0), gold)
 | 
				
			||||||
        if s0_root != b0_root or s0_root == -1 or b0_root == -1:
 | 
					        if s0_root != b0_root or s0_root == -1 or b0_root == -1:
 | 
				
			||||||
| 
						 | 
					@ -296,14 +485,16 @@ cdef class Break:
 | 
				
			||||||
            return cost + 1
 | 
					            return cost + 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef inline weight_t label_cost(StateClass s, const void* gold, attr_t label) nogil:
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
 | 
					cdef int _get_root(int word, const GoldParseStateC* gold) nogil:
 | 
				
			||||||
    while gold.heads[word] != word and gold.has_dep[word] and word >= 0:
 | 
					    if is_head_unknown(gold, word):
 | 
				
			||||||
        word = gold.heads[word]
 | 
					 | 
				
			||||||
    if not gold.has_dep[word]:
 | 
					 | 
				
			||||||
        return -1
 | 
					        return -1
 | 
				
			||||||
 | 
					    while gold.heads[word] != word and word >= 0:
 | 
				
			||||||
 | 
					        word = gold.heads[word]
 | 
				
			||||||
 | 
					        if is_head_unknown(gold, word):
 | 
				
			||||||
 | 
					            return -1
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        return word
 | 
					        return word
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -330,8 +521,6 @@ cdef int _del_state(Pool mem, void* state, void* x) except -1:
 | 
				
			||||||
cdef class ArcEager(TransitionSystem):
 | 
					cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					    def __init__(self, *args, **kwargs):
 | 
				
			||||||
        TransitionSystem.__init__(self, *args, **kwargs)
 | 
					        TransitionSystem.__init__(self, *args, **kwargs)
 | 
				
			||||||
        self.init_beam_state = _init_state
 | 
					 | 
				
			||||||
        self.del_beam_state = _del_state
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
    def get_actions(cls, **kwargs):
 | 
					    def get_actions(cls, **kwargs):
 | 
				
			||||||
| 
						 | 
					@ -345,10 +534,11 @@ cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
        for label in kwargs.get('right_labels', []):
 | 
					        for label in kwargs.get('right_labels', []):
 | 
				
			||||||
            actions[RIGHT][label] = 1
 | 
					            actions[RIGHT][label] = 1
 | 
				
			||||||
            actions[REDUCE][label] = 1
 | 
					            actions[REDUCE][label] = 1
 | 
				
			||||||
        for example in kwargs.get('gold_parses', []):
 | 
					        for example in kwargs.get('examples', []):
 | 
				
			||||||
            heads, labels = nonproj.projectivize(example.token_annotation.heads,
 | 
					            heads, labels = example.get_aligned_parse(projectivize=True)
 | 
				
			||||||
                                                 example.token_annotation.deps)
 | 
					            for child, (head, label) in enumerate(zip(heads, labels)):
 | 
				
			||||||
            for child, head, label in zip(example.token_annotation.ids, heads, labels):
 | 
					                if head is None or label is None:
 | 
				
			||||||
 | 
					                    continue
 | 
				
			||||||
                if label.upper() == 'ROOT' :
 | 
					                if label.upper() == 'ROOT' :
 | 
				
			||||||
                    label = 'ROOT'
 | 
					                    label = 'ROOT'
 | 
				
			||||||
                if head == child:
 | 
					                if head == child:
 | 
				
			||||||
| 
						 | 
					@ -378,102 +568,47 @@ cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
    def action_types(self):
 | 
					    def action_types(self):
 | 
				
			||||||
        return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
 | 
					        return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_cost(self, StateClass state, GoldParse gold, action):
 | 
					 | 
				
			||||||
        cdef Transition t = self.lookup_transition(action)
 | 
					 | 
				
			||||||
        if not t.is_valid(state.c, t.label):
 | 
					 | 
				
			||||||
            return 9000
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            return t.get_cost(state, &gold.c, t.label)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def transition(self, StateClass state, action):
 | 
					    def transition(self, StateClass state, action):
 | 
				
			||||||
        cdef Transition t = self.lookup_transition(action)
 | 
					        cdef Transition t = self.lookup_transition(action)
 | 
				
			||||||
        t.do(state.c, t.label)
 | 
					        t.do(state.c, t.label)
 | 
				
			||||||
        return state
 | 
					        return state
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def is_gold_parse(self, StateClass state, GoldParse gold):
 | 
					    def is_gold_parse(self, StateClass state, gold):
 | 
				
			||||||
        predicted = set()
 | 
					        raise NotImplementedError
 | 
				
			||||||
        truth = set()
 | 
					 | 
				
			||||||
        for i in range(gold.length):
 | 
					 | 
				
			||||||
            if gold.cand_to_gold[i] is None:
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
            if state.safe_get(i).dep:
 | 
					 | 
				
			||||||
                predicted.add((i, state.H(i),
 | 
					 | 
				
			||||||
                              self.strings[state.safe_get(i).dep]))
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                predicted.add((i, state.H(i), 'ROOT'))
 | 
					 | 
				
			||||||
            id_ = gold.orig.ids[gold.cand_to_gold[i]]
 | 
					 | 
				
			||||||
            head = gold.orig.heads[gold.cand_to_gold[i]]
 | 
					 | 
				
			||||||
            dep = gold.orig.deps[gold.cand_to_gold[i]]
 | 
					 | 
				
			||||||
            truth.add((id_, head, dep))
 | 
					 | 
				
			||||||
        return truth == predicted
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def has_gold(self, GoldParse gold, start=0, end=None):
 | 
					    def init_gold(self, StateClass state, Example example):
 | 
				
			||||||
        end = end or len(gold.heads)
 | 
					        gold = ArcEagerGold(self, state, example)
 | 
				
			||||||
        if all([tag is None for tag in gold.heads[start:end]]):
 | 
					        self._replace_unseen_labels(gold)
 | 
				
			||||||
            return False
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            return True
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def preprocess_gold(self, GoldParse gold):
 | 
					 | 
				
			||||||
        if not self.has_gold(gold):
 | 
					 | 
				
			||||||
            return None
 | 
					 | 
				
			||||||
        # Figure out whether we're using subtok
 | 
					 | 
				
			||||||
        use_subtok = False
 | 
					 | 
				
			||||||
        for action, labels in self.labels.items():
 | 
					 | 
				
			||||||
            if SUBTOK_LABEL in labels:
 | 
					 | 
				
			||||||
                use_subtok = True
 | 
					 | 
				
			||||||
                break
 | 
					 | 
				
			||||||
        for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
 | 
					 | 
				
			||||||
            # Missing values
 | 
					 | 
				
			||||||
            if head is None or dep is None:
 | 
					 | 
				
			||||||
                gold.c.heads[i] = i
 | 
					 | 
				
			||||||
                gold.c.has_dep[i] = False
 | 
					 | 
				
			||||||
            elif dep == SUBTOK_LABEL and not use_subtok:
 | 
					 | 
				
			||||||
                # If we're not doing the joint tokenization and parsing,
 | 
					 | 
				
			||||||
                # regard these subtok labels as missing
 | 
					 | 
				
			||||||
                gold.c.heads[i] = i
 | 
					 | 
				
			||||||
                gold.c.labels[i] = 0
 | 
					 | 
				
			||||||
                gold.c.has_dep[i] = False
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                if head > i:
 | 
					 | 
				
			||||||
                    action = LEFT
 | 
					 | 
				
			||||||
                elif head < i:
 | 
					 | 
				
			||||||
                    action = RIGHT
 | 
					 | 
				
			||||||
                else:
 | 
					 | 
				
			||||||
                    action = BREAK
 | 
					 | 
				
			||||||
                if dep not in self.labels[action]:
 | 
					 | 
				
			||||||
                    if action == BREAK:
 | 
					 | 
				
			||||||
                        dep = 'ROOT'
 | 
					 | 
				
			||||||
                    elif nonproj.is_decorated(dep):
 | 
					 | 
				
			||||||
                        backoff = nonproj.decompose(dep)[0]
 | 
					 | 
				
			||||||
                        if backoff in self.labels[action]:
 | 
					 | 
				
			||||||
                            dep = backoff
 | 
					 | 
				
			||||||
                        else:
 | 
					 | 
				
			||||||
                            dep = 'dep'
 | 
					 | 
				
			||||||
                    else:
 | 
					 | 
				
			||||||
                        dep = 'dep'
 | 
					 | 
				
			||||||
                gold.c.has_dep[i] = True
 | 
					 | 
				
			||||||
                if dep.upper() == 'ROOT':
 | 
					 | 
				
			||||||
                    dep = 'ROOT'
 | 
					 | 
				
			||||||
                gold.c.heads[i] = head
 | 
					 | 
				
			||||||
                gold.c.labels[i] = self.strings.add(dep)
 | 
					 | 
				
			||||||
        return gold
 | 
					        return gold
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_beam_parses(self, Beam beam):
 | 
					    def init_gold_batch(self, examples):
 | 
				
			||||||
        parses = []
 | 
					        all_states = self.init_batch([eg.predicted for eg in examples])
 | 
				
			||||||
        probs = beam.probs
 | 
					        golds = []
 | 
				
			||||||
        for i in range(beam.size):
 | 
					        states = []
 | 
				
			||||||
            state = <StateC*>beam.at(i)
 | 
					        for state, eg in zip(all_states, examples):
 | 
				
			||||||
            if state.is_final():
 | 
					            if self.has_gold(eg) and not state.is_final():
 | 
				
			||||||
                self.finalize_state(state)
 | 
					                golds.append(self.init_gold(state, eg))
 | 
				
			||||||
                prob = probs[i]
 | 
					                states.append(state)
 | 
				
			||||||
                parse = []
 | 
					        n_steps = sum([len(s.queue) for s in states])
 | 
				
			||||||
                for j in range(state.length):
 | 
					        return states, golds, n_steps
 | 
				
			||||||
                    head = state.H(j)
 | 
					
 | 
				
			||||||
                    label = self.strings[state._sent[j].dep]
 | 
					    def _replace_unseen_labels(self, ArcEagerGold gold):
 | 
				
			||||||
                    parse.append((head, j, label))
 | 
					        backoff_label = self.strings["dep"]
 | 
				
			||||||
                parses.append((prob, parse))
 | 
					        root_label = self.strings["ROOT"]
 | 
				
			||||||
        return parses
 | 
					        left_labels = self.labels[LEFT]
 | 
				
			||||||
 | 
					        right_labels = self.labels[RIGHT]
 | 
				
			||||||
 | 
					        break_labels = self.labels[BREAK]
 | 
				
			||||||
 | 
					        for i in range(gold.c.length):
 | 
				
			||||||
 | 
					            if not is_head_unknown(&gold.c, i):
 | 
				
			||||||
 | 
					                head = gold.c.heads[i]
 | 
				
			||||||
 | 
					                label = self.strings[gold.c.labels[i]]
 | 
				
			||||||
 | 
					                if head > i and label not in left_labels:
 | 
				
			||||||
 | 
					                    gold.c.labels[i] = backoff_label
 | 
				
			||||||
 | 
					                elif head < i and label not in right_labels:
 | 
				
			||||||
 | 
					                    gold.c.labels[i] = backoff_label
 | 
				
			||||||
 | 
					                elif head == i and label not in break_labels:
 | 
				
			||||||
 | 
					                    gold.c.labels[i] = root_label
 | 
				
			||||||
 | 
					        return gold
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef Transition lookup_transition(self, object name_or_id) except *:
 | 
					    cdef Transition lookup_transition(self, object name_or_id) except *:
 | 
				
			||||||
        if isinstance(name_or_id, int):
 | 
					        if isinstance(name_or_id, int):
 | 
				
			||||||
| 
						 | 
					@ -489,7 +624,7 @@ cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
        for i in range(self.n_moves):
 | 
					        for i in range(self.n_moves):
 | 
				
			||||||
            if self.c[i].move == move and self.c[i].label == label:
 | 
					            if self.c[i].move == move and self.c[i].label == label:
 | 
				
			||||||
                return self.c[i]
 | 
					                return self.c[i]
 | 
				
			||||||
        return Transition(clas=0, move=MISSING, label=0)
 | 
					        raise KeyError(f"Unknown transition: {name}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def move_name(self, int move, attr_t label):
 | 
					    def move_name(self, int move, attr_t label):
 | 
				
			||||||
        label_str = self.strings[label]
 | 
					        label_str = self.strings[label]
 | 
				
			||||||
| 
						 | 
					@ -554,6 +689,13 @@ cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
        doc.is_parsed = True
 | 
					        doc.is_parsed = True
 | 
				
			||||||
        set_children_from_heads(doc.c, doc.length)
 | 
					        set_children_from_heads(doc.c, doc.length)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def has_gold(self, Example eg, start=0, end=None):
 | 
				
			||||||
 | 
					        for word in eg.y[start:end]:
 | 
				
			||||||
 | 
					            if word.dep != 0:
 | 
				
			||||||
 | 
					                return True
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int set_valid(self, int* output, const StateC* st) nogil:
 | 
					    cdef int set_valid(self, int* output, const StateC* st) nogil:
 | 
				
			||||||
        cdef bint[N_MOVES] is_valid
 | 
					        cdef bint[N_MOVES] is_valid
 | 
				
			||||||
        is_valid[SHIFT] = Shift.is_valid(st, 0)
 | 
					        is_valid[SHIFT] = Shift.is_valid(st, 0)
 | 
				
			||||||
| 
						 | 
					@ -568,67 +710,109 @@ cdef class ArcEager(TransitionSystem):
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                output[i] = is_valid[self.c[i].move]
 | 
					                output[i] = is_valid[self.c[i].move]
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    def get_cost(self, StateClass stcls, gold, int i):
 | 
				
			||||||
 | 
					        if not isinstance(gold, ArcEagerGold):
 | 
				
			||||||
 | 
					            raise TypeError("Expected ArcEagerGold")
 | 
				
			||||||
 | 
					        cdef ArcEagerGold gold_ = gold
 | 
				
			||||||
 | 
					        gold_state = gold_.c
 | 
				
			||||||
 | 
					        n_gold = 0
 | 
				
			||||||
 | 
					        if self.c[i].is_valid(stcls.c, self.c[i].label):
 | 
				
			||||||
 | 
					            cost = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            cost = 9000
 | 
				
			||||||
 | 
					        return cost
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int set_costs(self, int* is_valid, weight_t* costs,
 | 
					    cdef int set_costs(self, int* is_valid, weight_t* costs,
 | 
				
			||||||
                       StateClass stcls, GoldParse gold) except -1:
 | 
					                       StateClass stcls, gold) except -1:
 | 
				
			||||||
        cdef int i, move
 | 
					        if not isinstance(gold, ArcEagerGold):
 | 
				
			||||||
        cdef attr_t label
 | 
					            raise TypeError("Expected ArcEagerGold")
 | 
				
			||||||
        cdef label_cost_func_t[N_MOVES] label_cost_funcs
 | 
					        cdef ArcEagerGold gold_ = gold
 | 
				
			||||||
        cdef move_cost_func_t[N_MOVES] move_cost_funcs
 | 
					        gold_.update(stcls)
 | 
				
			||||||
        cdef weight_t[N_MOVES] move_costs
 | 
					        gold_state = gold_.c
 | 
				
			||||||
        for i in range(N_MOVES):
 | 
					 | 
				
			||||||
            move_costs[i] = 9000
 | 
					 | 
				
			||||||
        move_cost_funcs[SHIFT] = Shift.move_cost
 | 
					 | 
				
			||||||
        move_cost_funcs[REDUCE] = Reduce.move_cost
 | 
					 | 
				
			||||||
        move_cost_funcs[LEFT] = LeftArc.move_cost
 | 
					 | 
				
			||||||
        move_cost_funcs[RIGHT] = RightArc.move_cost
 | 
					 | 
				
			||||||
        move_cost_funcs[BREAK] = Break.move_cost
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        label_cost_funcs[SHIFT] = Shift.label_cost
 | 
					 | 
				
			||||||
        label_cost_funcs[REDUCE] = Reduce.label_cost
 | 
					 | 
				
			||||||
        label_cost_funcs[LEFT] = LeftArc.label_cost
 | 
					 | 
				
			||||||
        label_cost_funcs[RIGHT] = RightArc.label_cost
 | 
					 | 
				
			||||||
        label_cost_funcs[BREAK] = Break.label_cost
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        cdef attr_t* labels = gold.c.labels
 | 
					 | 
				
			||||||
        cdef int* heads = gold.c.heads
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        n_gold = 0
 | 
					        n_gold = 0
 | 
				
			||||||
        for i in range(self.n_moves):
 | 
					        for i in range(self.n_moves):
 | 
				
			||||||
            if self.c[i].is_valid(stcls.c, self.c[i].label):
 | 
					            if self.c[i].is_valid(stcls.c, self.c[i].label):
 | 
				
			||||||
                is_valid[i] = True
 | 
					                is_valid[i] = True
 | 
				
			||||||
                move = self.c[i].move
 | 
					                costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
 | 
				
			||||||
                label = self.c[i].label
 | 
					 | 
				
			||||||
                if move_costs[move] == 9000:
 | 
					 | 
				
			||||||
                    move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
 | 
					 | 
				
			||||||
                costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
 | 
					 | 
				
			||||||
                n_gold += costs[i] <= 0
 | 
					                n_gold += costs[i] <= 0
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                is_valid[i] = False
 | 
					                is_valid[i] = False
 | 
				
			||||||
                costs[i] = 9000
 | 
					                costs[i] = 9000
 | 
				
			||||||
        if n_gold < 1:
 | 
					        if n_gold < 1:
 | 
				
			||||||
            # Check projectivity --- leading cause
 | 
					            raise ValueError
 | 
				
			||||||
            if is_nonproj_tree(gold.heads):
 | 
					 | 
				
			||||||
                raise ValueError(Errors.E020)
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                failure_state = stcls.print_state(gold.words)
 | 
					 | 
				
			||||||
                raise ValueError(Errors.E021.format(n_actions=self.n_moves,
 | 
					 | 
				
			||||||
                                                    state=failure_state))
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_beam_annot(self, Beam beam):
 | 
					    def get_oracle_sequence(self, Example example):
 | 
				
			||||||
        length = (<StateC*>beam.at(0)).length
 | 
					        cdef StateClass state
 | 
				
			||||||
        heads = [{} for _ in range(length)]
 | 
					        cdef ArcEagerGold gold
 | 
				
			||||||
        deps = [{} for _ in range(length)]
 | 
					        states, golds, n_steps = self.init_gold_batch([example])
 | 
				
			||||||
        probs = beam.probs
 | 
					        if not golds:
 | 
				
			||||||
        for i in range(beam.size):
 | 
					            return []
 | 
				
			||||||
            state = <StateC*>beam.at(i)
 | 
					
 | 
				
			||||||
            self.finalize_state(state)
 | 
					        cdef Pool mem = Pool()
 | 
				
			||||||
            if state.is_final():
 | 
					        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
 | 
				
			||||||
                prob = probs[i]
 | 
					        assert self.n_moves > 0
 | 
				
			||||||
                for j in range(state.length):
 | 
					        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
 | 
				
			||||||
                    head = j + state._sent[j].head
 | 
					        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
 | 
				
			||||||
                    dep = state._sent[j].dep
 | 
					
 | 
				
			||||||
                    heads[j].setdefault(head, 0.0)
 | 
					        state = states[0]
 | 
				
			||||||
                    heads[j][head] += prob
 | 
					        gold = golds[0]
 | 
				
			||||||
                    deps[j].setdefault(dep, 0.0)
 | 
					        history = []
 | 
				
			||||||
                    deps[j][dep] += prob
 | 
					        debug_log = []
 | 
				
			||||||
        return heads, deps
 | 
					        failed = False
 | 
				
			||||||
 | 
					        while not state.is_final():
 | 
				
			||||||
 | 
					            try:
 | 
				
			||||||
 | 
					                self.set_costs(is_valid, costs, state, gold)
 | 
				
			||||||
 | 
					            except ValueError:
 | 
				
			||||||
 | 
					                failed = True
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					            for i in range(self.n_moves):
 | 
				
			||||||
 | 
					                if is_valid[i] and costs[i] <= 0:
 | 
				
			||||||
 | 
					                    action = self.c[i]
 | 
				
			||||||
 | 
					                    history.append(i)
 | 
				
			||||||
 | 
					                    s0 = state.S(0)
 | 
				
			||||||
 | 
					                    b0 = state.B(0)
 | 
				
			||||||
 | 
					                    debug_log.append(" ".join((
 | 
				
			||||||
 | 
					                        self.get_class_name(i),
 | 
				
			||||||
 | 
					                        "S0=", (example.x[s0].text if s0 >= 0 else "__"),
 | 
				
			||||||
 | 
					                        "B0=", (example.x[b0].text if b0 >= 0 else "__"),
 | 
				
			||||||
 | 
					                        "S0 head?", str(state.has_head(state.S(0))),
 | 
				
			||||||
 | 
					                    )))
 | 
				
			||||||
 | 
					                    action.do(state.c, action.label)
 | 
				
			||||||
 | 
					                    break
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                failed = False
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					        if failed:
 | 
				
			||||||
 | 
					            print("Actions")
 | 
				
			||||||
 | 
					            for i in range(self.n_moves):
 | 
				
			||||||
 | 
					                print(self.get_class_name(i))
 | 
				
			||||||
 | 
					            print("Gold")
 | 
				
			||||||
 | 
					            for token in example.y:
 | 
				
			||||||
 | 
					                print(token.i, token.text, token.dep_, token.head.text)
 | 
				
			||||||
 | 
					            aligned_heads, aligned_labels = example.get_aligned_parse()
 | 
				
			||||||
 | 
					            print("Aligned heads")
 | 
				
			||||||
 | 
					            for i, head in enumerate(aligned_heads):
 | 
				
			||||||
 | 
					                print(example.x[i], example.x[head] if head is not None else "__")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            print("Predicted tokens")
 | 
				
			||||||
 | 
					            print([(w.i, w.text) for w in example.x])
 | 
				
			||||||
 | 
					            s0 = state.S(0)
 | 
				
			||||||
 | 
					            b0 = state.B(0)
 | 
				
			||||||
 | 
					            debug_log.append(" ".join((
 | 
				
			||||||
 | 
					                "?",
 | 
				
			||||||
 | 
					                "S0=", (example.x[s0].text if s0 >= 0 else "-"),
 | 
				
			||||||
 | 
					                "B0=", (example.x[b0].text if b0 >= 0 else "-"),
 | 
				
			||||||
 | 
					                "S0 head?", str(state.has_head(state.S(0))),
 | 
				
			||||||
 | 
					            )))
 | 
				
			||||||
 | 
					            s0 = state.S(0)
 | 
				
			||||||
 | 
					            b0 = state.B(0)
 | 
				
			||||||
 | 
					            print("\n".join(debug_log))
 | 
				
			||||||
 | 
					            print("Arc is gold B0, S0?", arc_is_gold(&gold.c, b0, s0))
 | 
				
			||||||
 | 
					            print("Arc is gold S0, B0?", arc_is_gold(&gold.c, s0, b0))
 | 
				
			||||||
 | 
					            print("is_head_unknown(s0)", is_head_unknown(&gold.c, s0))
 | 
				
			||||||
 | 
					            print("is_head_unknown(b0)", is_head_unknown(&gold.c, b0))
 | 
				
			||||||
 | 
					            print("b0", b0, "gold.heads[s0]", gold.c.heads[s0])
 | 
				
			||||||
 | 
					            print("Stack", [example.x[i] for i in state.stack])
 | 
				
			||||||
 | 
					            print("Buffer", [example.x[i] for i in state.queue])
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E024)
 | 
				
			||||||
 | 
					        return history
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,5 @@
 | 
				
			||||||
from .transition_system cimport TransitionSystem
 | 
					from .transition_system cimport TransitionSystem
 | 
				
			||||||
from .transition_system cimport Transition
 | 
					from .transition_system cimport Transition
 | 
				
			||||||
from ..gold cimport GoldParseC
 | 
					 | 
				
			||||||
from ..typedefs cimport attr_t
 | 
					from ..typedefs cimport attr_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,15 +1,16 @@
 | 
				
			||||||
from thinc.extra.search cimport Beam
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from collections import Counter
 | 
					from collections import Counter
 | 
				
			||||||
 | 
					from libc.stdint cimport int32_t
 | 
				
			||||||
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..typedefs cimport weight_t
 | 
					from ..typedefs cimport weight_t
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from ._state cimport StateC
 | 
					from ._state cimport StateC
 | 
				
			||||||
from .transition_system cimport Transition
 | 
					from .transition_system cimport Transition
 | 
				
			||||||
from .transition_system cimport do_func_t
 | 
					from .transition_system cimport do_func_t
 | 
				
			||||||
from ..gold cimport GoldParseC, GoldParse
 | 
					 | 
				
			||||||
from ..lexeme cimport Lexeme
 | 
					from ..lexeme cimport Lexeme
 | 
				
			||||||
from ..attrs cimport IS_SPACE
 | 
					from ..attrs cimport IS_SPACE
 | 
				
			||||||
 | 
					from ..gold.iob_utils import biluo_tags_from_offsets
 | 
				
			||||||
 | 
					from ..gold.example cimport Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -35,6 +36,43 @@ MOVE_NAMES[OUT] = 'O'
 | 
				
			||||||
MOVE_NAMES[ISNT] = 'x'
 | 
					MOVE_NAMES[ISNT] = 'x'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef struct GoldNERStateC:
 | 
				
			||||||
 | 
					    Transition* ner
 | 
				
			||||||
 | 
					    int32_t length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef class BiluoGold:
 | 
				
			||||||
 | 
					    cdef Pool mem
 | 
				
			||||||
 | 
					    cdef GoldNERStateC c
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, BiluoPushDown moves, StateClass stcls, Example example):
 | 
				
			||||||
 | 
					        self.mem = Pool()
 | 
				
			||||||
 | 
					        self.c = create_gold_state(self.mem, moves, stcls, example)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def update(self, StateClass stcls):
 | 
				
			||||||
 | 
					        update_gold_state(&self.c, stcls)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef GoldNERStateC create_gold_state(
 | 
				
			||||||
 | 
					    Pool mem,
 | 
				
			||||||
 | 
					    BiluoPushDown moves,
 | 
				
			||||||
 | 
					    StateClass stcls,
 | 
				
			||||||
 | 
					    Example example
 | 
				
			||||||
 | 
					) except *:
 | 
				
			||||||
 | 
					    cdef GoldNERStateC gs
 | 
				
			||||||
 | 
					    gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
 | 
				
			||||||
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
 | 
					    for i, ner_tag in enumerate(ner_tags):
 | 
				
			||||||
 | 
					        gs.ner[i] = moves.lookup_transition(ner_tag)
 | 
				
			||||||
 | 
					    return gs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef void update_gold_state(GoldNERStateC* gs, StateClass stcls) except *:
 | 
				
			||||||
 | 
					    # We don't need to update each time, unlike the parser.
 | 
				
			||||||
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef do_func_t[N_MOVES] do_funcs
 | 
					cdef do_func_t[N_MOVES] do_funcs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -71,12 +109,12 @@ cdef class BiluoPushDown(TransitionSystem):
 | 
				
			||||||
            for action in (BEGIN, IN, LAST, UNIT):
 | 
					            for action in (BEGIN, IN, LAST, UNIT):
 | 
				
			||||||
                actions[action][entity_type] = 1
 | 
					                actions[action][entity_type] = 1
 | 
				
			||||||
        moves = ('M', 'B', 'I', 'L', 'U')
 | 
					        moves = ('M', 'B', 'I', 'L', 'U')
 | 
				
			||||||
        for example in kwargs.get('gold_parses', []):
 | 
					        for example in kwargs.get('examples', []):
 | 
				
			||||||
            for i, ner_tag in enumerate(example.token_annotation.entities):
 | 
					            for token in example.y:
 | 
				
			||||||
                if ner_tag != 'O' and ner_tag != '-':
 | 
					                ent_type = token.ent_type_
 | 
				
			||||||
                    _, label = ner_tag.split('-', 1)
 | 
					                if ent_type:
 | 
				
			||||||
                    for action in (BEGIN, IN, LAST, UNIT):
 | 
					                    for action in (BEGIN, IN, LAST, UNIT):
 | 
				
			||||||
                        actions[action][label] += 1
 | 
					                        actions[action][ent_type] += 1
 | 
				
			||||||
        return actions
 | 
					        return actions
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
| 
						 | 
					@ -91,52 +129,16 @@ cdef class BiluoPushDown(TransitionSystem):
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            return MOVE_NAMES[move] + '-' + self.strings[label]
 | 
					            return MOVE_NAMES[move] + '-' + self.strings[label]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def has_gold(self, GoldParse gold, start=0, end=None):
 | 
					    def init_gold_batch(self, examples):
 | 
				
			||||||
        end = end or len(gold.ner)
 | 
					        all_states = self.init_batch([eg.predicted for eg in examples])
 | 
				
			||||||
        if all([tag in ('-', None) for tag in gold.ner[start:end]]):
 | 
					        golds = []
 | 
				
			||||||
            return False
 | 
					        states = []
 | 
				
			||||||
        else:
 | 
					        for state, eg in zip(all_states, examples):
 | 
				
			||||||
            return True
 | 
					            if self.has_gold(eg) and not state.is_final():
 | 
				
			||||||
 | 
					                golds.append(self.init_gold(state, eg))
 | 
				
			||||||
    def preprocess_gold(self, GoldParse gold):
 | 
					                states.append(state)
 | 
				
			||||||
        if not self.has_gold(gold):
 | 
					        n_steps = sum([len(s.queue) for s in states])
 | 
				
			||||||
            return None
 | 
					        return states, golds, n_steps
 | 
				
			||||||
        for i in range(gold.length):
 | 
					 | 
				
			||||||
            gold.c.ner[i] = self.lookup_transition(gold.ner[i])
 | 
					 | 
				
			||||||
        return gold
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get_beam_annot(self, Beam beam):
 | 
					 | 
				
			||||||
        entities = {}
 | 
					 | 
				
			||||||
        probs = beam.probs
 | 
					 | 
				
			||||||
        for i in range(beam.size):
 | 
					 | 
				
			||||||
            state = <StateC*>beam.at(i)
 | 
					 | 
				
			||||||
            if state.is_final():
 | 
					 | 
				
			||||||
                self.finalize_state(state)
 | 
					 | 
				
			||||||
                prob = probs[i]
 | 
					 | 
				
			||||||
                for j in range(state._e_i):
 | 
					 | 
				
			||||||
                    start = state._ents[j].start
 | 
					 | 
				
			||||||
                    end = state._ents[j].end
 | 
					 | 
				
			||||||
                    label = state._ents[j].label
 | 
					 | 
				
			||||||
                    entities.setdefault((start, end, label), 0.0)
 | 
					 | 
				
			||||||
                    entities[(start, end, label)] += prob
 | 
					 | 
				
			||||||
        return entities
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get_beam_parses(self, Beam beam):
 | 
					 | 
				
			||||||
        parses = []
 | 
					 | 
				
			||||||
        probs = beam.probs
 | 
					 | 
				
			||||||
        for i in range(beam.size):
 | 
					 | 
				
			||||||
            state = <StateC*>beam.at(i)
 | 
					 | 
				
			||||||
            if state.is_final():
 | 
					 | 
				
			||||||
                self.finalize_state(state)
 | 
					 | 
				
			||||||
                prob = probs[i]
 | 
					 | 
				
			||||||
                parse = []
 | 
					 | 
				
			||||||
                for j in range(state._e_i):
 | 
					 | 
				
			||||||
                    start = state._ents[j].start
 | 
					 | 
				
			||||||
                    end = state._ents[j].end
 | 
					 | 
				
			||||||
                    label = state._ents[j].label
 | 
					 | 
				
			||||||
                    parse.append((start, end, self.strings[label]))
 | 
					 | 
				
			||||||
                parses.append((prob, parse))
 | 
					 | 
				
			||||||
        return parses
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef Transition lookup_transition(self, object name) except *:
 | 
					    cdef Transition lookup_transition(self, object name) except *:
 | 
				
			||||||
        cdef attr_t label
 | 
					        cdef attr_t label
 | 
				
			||||||
| 
						 | 
					@ -237,6 +239,47 @@ cdef class BiluoPushDown(TransitionSystem):
 | 
				
			||||||
                    self.add_action(UNIT, st._sent[i].ent_type)
 | 
					                    self.add_action(UNIT, st._sent[i].ent_type)
 | 
				
			||||||
                    self.add_action(LAST, st._sent[i].ent_type)
 | 
					                    self.add_action(LAST, st._sent[i].ent_type)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def init_gold(self, StateClass state, Example example):
 | 
				
			||||||
 | 
					        return BiluoGold(self, state, example)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def has_gold(self, Example eg, start=0, end=None):
 | 
				
			||||||
 | 
					        for word in eg.y[start:end]:
 | 
				
			||||||
 | 
					            if word.ent_iob != 0:
 | 
				
			||||||
 | 
					                return True
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def get_cost(self, StateClass stcls, gold, int i):
 | 
				
			||||||
 | 
					        if not isinstance(gold, BiluoGold):
 | 
				
			||||||
 | 
					            raise TypeError("Expected BiluoGold")
 | 
				
			||||||
 | 
					        cdef BiluoGold gold_ = gold
 | 
				
			||||||
 | 
					        gold_state = gold_.c
 | 
				
			||||||
 | 
					        n_gold = 0
 | 
				
			||||||
 | 
					        if self.c[i].is_valid(stcls.c, self.c[i].label):
 | 
				
			||||||
 | 
					            cost = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            cost = 9000
 | 
				
			||||||
 | 
					        return cost
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cdef int set_costs(self, int* is_valid, weight_t* costs,
 | 
				
			||||||
 | 
					                       StateClass stcls, gold) except -1:
 | 
				
			||||||
 | 
					        if not isinstance(gold, BiluoGold):
 | 
				
			||||||
 | 
					            raise TypeError("Expected BiluoGold")
 | 
				
			||||||
 | 
					        cdef BiluoGold gold_ = gold
 | 
				
			||||||
 | 
					        gold_.update(stcls)
 | 
				
			||||||
 | 
					        gold_state = gold_.c
 | 
				
			||||||
 | 
					        n_gold = 0
 | 
				
			||||||
 | 
					        for i in range(self.n_moves):
 | 
				
			||||||
 | 
					            if self.c[i].is_valid(stcls.c, self.c[i].label):
 | 
				
			||||||
 | 
					                is_valid[i] = 1
 | 
				
			||||||
 | 
					                costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
 | 
				
			||||||
 | 
					                n_gold += costs[i] <= 0
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                is_valid[i] = 0
 | 
				
			||||||
 | 
					                costs[i] = 9000
 | 
				
			||||||
 | 
					        if n_gold < 1:
 | 
				
			||||||
 | 
					            raise ValueError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class Missing:
 | 
					cdef class Missing:
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
| 
						 | 
					@ -248,7 +291,7 @@ cdef class Missing:
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
        return 9000
 | 
					        return 9000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -300,7 +343,8 @@ cdef class Begin:
 | 
				
			||||||
        st.pop()
 | 
					        st.pop()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <GoldNERStateC*>_gold
 | 
				
			||||||
        cdef int g_act = gold.ner[s.B(0)].move
 | 
					        cdef int g_act = gold.ner[s.B(0)].move
 | 
				
			||||||
        cdef attr_t g_tag = gold.ner[s.B(0)].label
 | 
					        cdef attr_t g_tag = gold.ner[s.B(0)].label
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -363,7 +407,8 @@ cdef class In:
 | 
				
			||||||
        st.pop()
 | 
					        st.pop()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <GoldNERStateC*>_gold
 | 
				
			||||||
        move = IN
 | 
					        move = IN
 | 
				
			||||||
        cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
 | 
					        cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
 | 
				
			||||||
        cdef int g_act = gold.ner[s.B(0)].move
 | 
					        cdef int g_act = gold.ner[s.B(0)].move
 | 
				
			||||||
| 
						 | 
					@ -429,7 +474,8 @@ cdef class Last:
 | 
				
			||||||
        st.pop()
 | 
					        st.pop()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <GoldNERStateC*>_gold
 | 
				
			||||||
        move = LAST
 | 
					        move = LAST
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cdef int g_act = gold.ner[s.B(0)].move
 | 
					        cdef int g_act = gold.ner[s.B(0)].move
 | 
				
			||||||
| 
						 | 
					@ -497,7 +543,8 @@ cdef class Unit:
 | 
				
			||||||
        st.pop()
 | 
					        st.pop()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <GoldNERStateC*>_gold
 | 
				
			||||||
        cdef int g_act = gold.ner[s.B(0)].move
 | 
					        cdef int g_act = gold.ner[s.B(0)].move
 | 
				
			||||||
        cdef attr_t g_tag = gold.ner[s.B(0)].label
 | 
					        cdef attr_t g_tag = gold.ner[s.B(0)].label
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -537,7 +584,8 @@ cdef class Out:
 | 
				
			||||||
        st.pop()
 | 
					        st.pop()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
 | 
					    cdef weight_t cost(StateClass s, const void* _gold, attr_t label) nogil:
 | 
				
			||||||
 | 
					        gold = <GoldNERStateC*>_gold
 | 
				
			||||||
        cdef int g_act = gold.ner[s.B(0)].move
 | 
					        cdef int g_act = gold.ner[s.B(0)].move
 | 
				
			||||||
        cdef attr_t g_tag = gold.ner[s.B(0)].label
 | 
					        cdef attr_t g_tag = gold.ner[s.B(0)].label
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,7 +9,6 @@ from libcpp.vector cimport vector
 | 
				
			||||||
from libc.string cimport memset, memcpy
 | 
					from libc.string cimport memset, memcpy
 | 
				
			||||||
from libc.stdlib cimport calloc, free
 | 
					from libc.stdlib cimport calloc, free
 | 
				
			||||||
from cymem.cymem cimport Pool
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
from thinc.extra.search cimport Beam
 | 
					 | 
				
			||||||
from thinc.backends.linalg cimport Vec, VecVec
 | 
					from thinc.backends.linalg cimport Vec, VecVec
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
 | 
					from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops
 | 
				
			||||||
| 
						 | 
					@ -21,7 +20,6 @@ import numpy
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..tokens.doc cimport Doc
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
from ..gold cimport GoldParse
 | 
					 | 
				
			||||||
from ..typedefs cimport weight_t, class_t, hash_t
 | 
					from ..typedefs cimport weight_t, class_t, hash_t
 | 
				
			||||||
from ._parser_model cimport alloc_activations, free_activations
 | 
					from ._parser_model cimport alloc_activations, free_activations
 | 
				
			||||||
from ._parser_model cimport predict_states, arg_max_if_valid
 | 
					from ._parser_model cimport predict_states, arg_max_if_valid
 | 
				
			||||||
| 
						 | 
					@ -30,14 +28,12 @@ from ._parser_model cimport get_c_weights, get_c_sizes
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from ._state cimport StateC
 | 
					from ._state cimport StateC
 | 
				
			||||||
from .transition_system cimport Transition
 | 
					from .transition_system cimport Transition
 | 
				
			||||||
from . cimport _beam_utils
 | 
					from ..gold.example cimport Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..gold import Example
 | 
					 | 
				
			||||||
from ..util import link_vectors_to_models, create_default_optimizer, registry
 | 
					from ..util import link_vectors_to_models, create_default_optimizer, registry
 | 
				
			||||||
from ..compat import copy_array
 | 
					from ..compat import copy_array
 | 
				
			||||||
from ..errors import Errors, Warnings
 | 
					from ..errors import Errors, Warnings
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from . import _beam_utils
 | 
					 | 
				
			||||||
from . import nonproj
 | 
					from . import nonproj
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -144,71 +140,46 @@ cdef class Parser:
 | 
				
			||||||
        '''
 | 
					        '''
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def preprocess_gold(self, examples):
 | 
					 | 
				
			||||||
        for ex in examples:
 | 
					 | 
				
			||||||
            yield ex
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def use_params(self, params):
 | 
					    def use_params(self, params):
 | 
				
			||||||
        # Can't decorate cdef class :(. Workaround.
 | 
					        # Can't decorate cdef class :(. Workaround.
 | 
				
			||||||
        with self.model.use_params(params):
 | 
					        with self.model.use_params(params):
 | 
				
			||||||
            yield
 | 
					            yield
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, Doc doc, beam_width=None):
 | 
					    def __call__(self, Doc doc):
 | 
				
			||||||
        """Apply the parser or entity recognizer, setting the annotations onto
 | 
					        """Apply the parser or entity recognizer, setting the annotations onto
 | 
				
			||||||
        the `Doc` object.
 | 
					        the `Doc` object.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        doc (Doc): The document to be processed.
 | 
					        doc (Doc): The document to be processed.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        if beam_width is None:
 | 
					        states = self.predict([doc])
 | 
				
			||||||
            beam_width = self.cfg['beam_width']
 | 
					 | 
				
			||||||
        beam_density = self.cfg.get('beam_density', 0.)
 | 
					 | 
				
			||||||
        states = self.predict([doc], beam_width=beam_width,
 | 
					 | 
				
			||||||
                              beam_density=beam_density)
 | 
					 | 
				
			||||||
        self.set_annotations([doc], states, tensors=None)
 | 
					        self.set_annotations([doc], states, tensors=None)
 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None,
 | 
					    def pipe(self, docs, int batch_size=256, int n_threads=-1):
 | 
				
			||||||
             as_example=False):
 | 
					 | 
				
			||||||
        """Process a stream of documents.
 | 
					        """Process a stream of documents.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        stream: The sequence of documents to process.
 | 
					        stream: The sequence of documents to process.
 | 
				
			||||||
        batch_size (int): Number of documents to accumulate into a working set.
 | 
					        batch_size (int): Number of documents to accumulate into a working set.
 | 
				
			||||||
        YIELDS (Doc): Documents, in order.
 | 
					        YIELDS (Doc): Documents, in order.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        if beam_width is None:
 | 
					 | 
				
			||||||
            beam_width = self.cfg['beam_width']
 | 
					 | 
				
			||||||
        beam_density = self.cfg.get('beam_density', 0.)
 | 
					 | 
				
			||||||
        cdef Doc doc
 | 
					        cdef Doc doc
 | 
				
			||||||
        for batch in util.minibatch(docs, size=batch_size):
 | 
					        for batch in util.minibatch(docs, size=batch_size):
 | 
				
			||||||
            batch_in_order = list(batch)
 | 
					            batch_in_order = list(batch)
 | 
				
			||||||
            docs = [self._get_doc(ex) for ex in batch_in_order]
 | 
					            by_length = sorted(batch, key=lambda doc: len(doc))
 | 
				
			||||||
            by_length = sorted(docs, key=lambda doc: len(doc))
 | 
					 | 
				
			||||||
            for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
 | 
					            for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
 | 
				
			||||||
                subbatch = list(subbatch)
 | 
					                subbatch = list(subbatch)
 | 
				
			||||||
                parse_states = self.predict(subbatch, beam_width=beam_width,
 | 
					                parse_states = self.predict(subbatch)
 | 
				
			||||||
                                            beam_density=beam_density)
 | 
					 | 
				
			||||||
                self.set_annotations(subbatch, parse_states, tensors=None)
 | 
					                self.set_annotations(subbatch, parse_states, tensors=None)
 | 
				
			||||||
            if as_example:
 | 
					            yield from batch_in_order
 | 
				
			||||||
                annotated_examples = []
 | 
					 | 
				
			||||||
                for ex, doc in zip(batch_in_order, docs):
 | 
					 | 
				
			||||||
                    ex.doc = doc
 | 
					 | 
				
			||||||
                    annotated_examples.append(ex)
 | 
					 | 
				
			||||||
                yield from annotated_examples
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                yield from batch_in_order
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
 | 
					    def predict(self, docs):
 | 
				
			||||||
        if isinstance(docs, Doc):
 | 
					        if isinstance(docs, Doc):
 | 
				
			||||||
            docs = [docs]
 | 
					            docs = [docs]
 | 
				
			||||||
        if not any(len(doc) for doc in docs):
 | 
					        if not any(len(doc) for doc in docs):
 | 
				
			||||||
            result = self.moves.init_batch(docs)
 | 
					            result = self.moves.init_batch(docs)
 | 
				
			||||||
            self._resize()
 | 
					            self._resize()
 | 
				
			||||||
            return result
 | 
					            return result
 | 
				
			||||||
        if beam_width < 2:
 | 
					        return self.greedy_parse(docs, drop=0.0)
 | 
				
			||||||
            return self.greedy_parse(docs, drop=drop)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            return self.beam_parse(docs, beam_width=beam_width,
 | 
					 | 
				
			||||||
                                   beam_density=beam_density, drop=drop)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def greedy_parse(self, docs, drop=0.):
 | 
					    def greedy_parse(self, docs, drop=0.):
 | 
				
			||||||
        cdef vector[StateC*] states
 | 
					        cdef vector[StateC*] states
 | 
				
			||||||
| 
						 | 
					@ -230,44 +201,6 @@ cdef class Parser:
 | 
				
			||||||
                weights, sizes)
 | 
					                weights, sizes)
 | 
				
			||||||
        return batch
 | 
					        return batch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
 | 
					 | 
				
			||||||
        cdef Beam beam
 | 
					 | 
				
			||||||
        cdef Doc doc
 | 
					 | 
				
			||||||
        cdef np.ndarray token_ids
 | 
					 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					 | 
				
			||||||
        beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density)
 | 
					 | 
				
			||||||
        # This is pretty dirty, but the NER can resize itself in init_batch,
 | 
					 | 
				
			||||||
        # if labels are missing. We therefore have to check whether we need to
 | 
					 | 
				
			||||||
        # expand our model output.
 | 
					 | 
				
			||||||
        self._resize()
 | 
					 | 
				
			||||||
        cdef int nr_feature = self.model.get_ref("lower").get_dim("nF")
 | 
					 | 
				
			||||||
        model = self.model.predict(docs)
 | 
					 | 
				
			||||||
        token_ids = numpy.zeros((len(docs) * beam_width, nr_feature),
 | 
					 | 
				
			||||||
                                 dtype='i', order='C')
 | 
					 | 
				
			||||||
        cdef int* c_ids
 | 
					 | 
				
			||||||
        cdef int n_states
 | 
					 | 
				
			||||||
        model = self.model.predict(docs)
 | 
					 | 
				
			||||||
        todo = [beam for beam in beams if not beam.is_done]
 | 
					 | 
				
			||||||
        while todo:
 | 
					 | 
				
			||||||
            token_ids.fill(-1)
 | 
					 | 
				
			||||||
            c_ids = <int*>token_ids.data
 | 
					 | 
				
			||||||
            n_states = 0
 | 
					 | 
				
			||||||
            for beam in todo:
 | 
					 | 
				
			||||||
                for i in range(beam.size):
 | 
					 | 
				
			||||||
                    state = <StateC*>beam.at(i)
 | 
					 | 
				
			||||||
                    # This way we avoid having to score finalized states
 | 
					 | 
				
			||||||
                    # We do have to take care to keep indexes aligned, though
 | 
					 | 
				
			||||||
                    if not state.is_final():
 | 
					 | 
				
			||||||
                        state.set_context_tokens(c_ids, nr_feature)
 | 
					 | 
				
			||||||
                        c_ids += nr_feature
 | 
					 | 
				
			||||||
                        n_states += 1
 | 
					 | 
				
			||||||
            if n_states == 0:
 | 
					 | 
				
			||||||
                break
 | 
					 | 
				
			||||||
            vectors = model.state2vec.predict(token_ids[:n_states])
 | 
					 | 
				
			||||||
            scores = model.vec2scores.predict(vectors)
 | 
					 | 
				
			||||||
            todo = self.transition_beams(todo, scores)
 | 
					 | 
				
			||||||
        return beams
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef void _parseC(self, StateC** states,
 | 
					    cdef void _parseC(self, StateC** states,
 | 
				
			||||||
            WeightsC weights, SizesC sizes) nogil:
 | 
					            WeightsC weights, SizesC sizes) nogil:
 | 
				
			||||||
        cdef int i, j
 | 
					        cdef int i, j
 | 
				
			||||||
| 
						 | 
					@ -288,20 +221,9 @@ cdef class Parser:
 | 
				
			||||||
            unfinished.clear()
 | 
					            unfinished.clear()
 | 
				
			||||||
        free_activations(&activations)
 | 
					        free_activations(&activations)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def set_annotations(self, docs, states_or_beams, tensors=None):
 | 
					    def set_annotations(self, docs, states, tensors=None):
 | 
				
			||||||
        cdef StateClass state
 | 
					        cdef StateClass state
 | 
				
			||||||
        cdef Beam beam
 | 
					 | 
				
			||||||
        cdef Doc doc
 | 
					        cdef Doc doc
 | 
				
			||||||
        states = []
 | 
					 | 
				
			||||||
        beams = []
 | 
					 | 
				
			||||||
        for state_or_beam in states_or_beams:
 | 
					 | 
				
			||||||
            if isinstance(state_or_beam, StateClass):
 | 
					 | 
				
			||||||
                states.append(state_or_beam)
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                beam = state_or_beam
 | 
					 | 
				
			||||||
                state = StateClass.borrow(<StateC*>beam.at(0))
 | 
					 | 
				
			||||||
                states.append(state)
 | 
					 | 
				
			||||||
                beams.append(beam)
 | 
					 | 
				
			||||||
        for i, (state, doc) in enumerate(zip(states, docs)):
 | 
					        for i, (state, doc) in enumerate(zip(states, docs)):
 | 
				
			||||||
            self.moves.finalize_state(state.c)
 | 
					            self.moves.finalize_state(state.c)
 | 
				
			||||||
            for j in range(doc.length):
 | 
					            for j in range(doc.length):
 | 
				
			||||||
| 
						 | 
					@ -309,8 +231,6 @@ cdef class Parser:
 | 
				
			||||||
            self.moves.finalize_doc(doc)
 | 
					            self.moves.finalize_doc(doc)
 | 
				
			||||||
            for hook in self.postprocesses:
 | 
					            for hook in self.postprocesses:
 | 
				
			||||||
                hook(doc)
 | 
					                hook(doc)
 | 
				
			||||||
        for beam in beams:
 | 
					 | 
				
			||||||
            _beam_utils.cleanup_beam(beam)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def transition_states(self, states, float[:, ::1] scores):
 | 
					    def transition_states(self, states, float[:, ::1] scores):
 | 
				
			||||||
        cdef StateClass state
 | 
					        cdef StateClass state
 | 
				
			||||||
| 
						 | 
					@ -342,50 +262,25 @@ cdef class Parser:
 | 
				
			||||||
                states[i].push_hist(guess)
 | 
					                states[i].push_hist(guess)
 | 
				
			||||||
        free(is_valid)
 | 
					        free(is_valid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def transition_beams(self, beams, float[:, ::1] scores):
 | 
					 | 
				
			||||||
        cdef Beam beam
 | 
					 | 
				
			||||||
        cdef float* c_scores = &scores[0, 0]
 | 
					 | 
				
			||||||
        for beam in beams:
 | 
					 | 
				
			||||||
            for i in range(beam.size):
 | 
					 | 
				
			||||||
                state = <StateC*>beam.at(i)
 | 
					 | 
				
			||||||
                if not state.is_final():
 | 
					 | 
				
			||||||
                    self.moves.set_valid(beam.is_valid[i], state)
 | 
					 | 
				
			||||||
                    memcpy(beam.scores[i], c_scores, scores.shape[1] * sizeof(float))
 | 
					 | 
				
			||||||
                    c_scores += scores.shape[1]
 | 
					 | 
				
			||||||
            beam.advance(_beam_utils.transition_state, _beam_utils.hash_state, <void*>self.moves.c)
 | 
					 | 
				
			||||||
            beam.check_done(_beam_utils.check_final_state, NULL)
 | 
					 | 
				
			||||||
        return [b for b in beams if not b.is_done]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
 | 
					    def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if losses is None:
 | 
					        if losses is None:
 | 
				
			||||||
            losses = {}
 | 
					            losses = {}
 | 
				
			||||||
        losses.setdefault(self.name, 0.)
 | 
					        losses.setdefault(self.name, 0.)
 | 
				
			||||||
        for multitask in self._multitasks:
 | 
					        for multitask in self._multitasks:
 | 
				
			||||||
            multitask.update(examples, drop=drop, sgd=sgd)
 | 
					            multitask.update(examples, drop=drop, sgd=sgd)
 | 
				
			||||||
        # The probability we use beam update, instead of falling back to
 | 
					 | 
				
			||||||
        # a greedy update
 | 
					 | 
				
			||||||
        beam_update_prob = self.cfg['beam_update_prob']
 | 
					 | 
				
			||||||
        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
 | 
					 | 
				
			||||||
            return self.update_beam(examples, self.cfg['beam_width'],
 | 
					 | 
				
			||||||
                    drop=drop, sgd=sgd, losses=losses, set_annotations=set_annotations,
 | 
					 | 
				
			||||||
                    beam_density=self.cfg.get('beam_density', 0.001))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					        set_dropout_rate(self.model, drop)
 | 
				
			||||||
        cut_gold = True
 | 
					 | 
				
			||||||
        if cut_gold:
 | 
					 | 
				
			||||||
            # Chop sequences into lengths of this many transitions, to make the
 | 
					 | 
				
			||||||
            # batch uniform length.
 | 
					 | 
				
			||||||
            cut_gold = numpy.random.choice(range(20, 100))
 | 
					 | 
				
			||||||
            states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            states, golds, max_steps = self._init_gold_batch_no_cut(examples)
 | 
					 | 
				
			||||||
        states_golds = [(s, g) for (s, g) in zip(states, golds)
 | 
					 | 
				
			||||||
                        if not s.is_final() and g is not None]
 | 
					 | 
				
			||||||
        # Prepare the stepwise model, and get the callback for finishing the batch
 | 
					        # Prepare the stepwise model, and get the callback for finishing the batch
 | 
				
			||||||
        model, backprop_tok2vec = self.model.begin_update([ex.doc for ex in examples])
 | 
					        model, backprop_tok2vec = self.model.begin_update(
 | 
				
			||||||
 | 
					            [eg.predicted for eg in examples])
 | 
				
			||||||
 | 
					        # Chop sequences into lengths of this many transitions, to make the
 | 
				
			||||||
 | 
					        # batch uniform length. We randomize this to overfit less.
 | 
				
			||||||
 | 
					        cut_gold = numpy.random.choice(range(20, 100))
 | 
				
			||||||
 | 
					        states, golds, max_steps = self._init_gold_batch(
 | 
				
			||||||
 | 
					            examples,
 | 
				
			||||||
 | 
					            max_length=cut_gold
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
        all_states = list(states)
 | 
					        all_states = list(states)
 | 
				
			||||||
 | 
					        states_golds = zip(states, golds)
 | 
				
			||||||
        for _ in range(max_steps):
 | 
					        for _ in range(max_steps):
 | 
				
			||||||
            if not states_golds:
 | 
					            if not states_golds:
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
| 
						 | 
					@ -395,18 +290,18 @@ cdef class Parser:
 | 
				
			||||||
            backprop(d_scores)
 | 
					            backprop(d_scores)
 | 
				
			||||||
            # Follow the predicted action
 | 
					            # Follow the predicted action
 | 
				
			||||||
            self.transition_states(states, scores)
 | 
					            self.transition_states(states, scores)
 | 
				
			||||||
            states_golds = [eg for eg in states_golds if not eg[0].is_final()]
 | 
					            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        backprop_tok2vec(golds)
 | 
					        backprop_tok2vec(golds)
 | 
				
			||||||
        if sgd is not None:
 | 
					        if sgd not in (None, False):
 | 
				
			||||||
            self.model.finish_update(sgd)
 | 
					            self.model.finish_update(sgd)
 | 
				
			||||||
        if set_annotations:
 | 
					        if set_annotations:
 | 
				
			||||||
            docs = [ex.doc for ex in examples]
 | 
					            docs = [eg.predicted for eg in examples]
 | 
				
			||||||
            self.set_annotations(docs, all_states)
 | 
					            self.set_annotations(docs, all_states)
 | 
				
			||||||
        return losses
 | 
					        return losses
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def rehearse(self, examples, sgd=None, losses=None, **cfg):
 | 
					    def rehearse(self, examples, sgd=None, losses=None, **cfg):
 | 
				
			||||||
        """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
 | 
					        """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					 | 
				
			||||||
        if losses is None:
 | 
					        if losses is None:
 | 
				
			||||||
            losses = {}
 | 
					            losses = {}
 | 
				
			||||||
        for multitask in self._multitasks:
 | 
					        for multitask in self._multitasks:
 | 
				
			||||||
| 
						 | 
					@ -416,7 +311,7 @@ cdef class Parser:
 | 
				
			||||||
            return None
 | 
					            return None
 | 
				
			||||||
        losses.setdefault(self.name, 0.)
 | 
					        losses.setdefault(self.name, 0.)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					        docs = [eg.predicted for eg in examples]
 | 
				
			||||||
        states = self.moves.init_batch(docs)
 | 
					        states = self.moves.init_batch(docs)
 | 
				
			||||||
        # This is pretty dirty, but the NER can resize itself in init_batch,
 | 
					        # This is pretty dirty, but the NER can resize itself in init_batch,
 | 
				
			||||||
        # if labels are missing. We therefore have to check whether we need to
 | 
					        # if labels are missing. We therefore have to check whether we need to
 | 
				
			||||||
| 
						 | 
					@ -448,52 +343,6 @@ cdef class Parser:
 | 
				
			||||||
        losses[self.name] += loss / n_scores
 | 
					        losses[self.name] += loss / n_scores
 | 
				
			||||||
        return losses
 | 
					        return losses
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def update_beam(self, examples, width, drop=0., sgd=None, losses=None,
 | 
					 | 
				
			||||||
                    set_annotations=False, beam_density=0.0):
 | 
					 | 
				
			||||||
        examples = Example.to_example_objects(examples)
 | 
					 | 
				
			||||||
        docs = [ex.doc for ex in examples]
 | 
					 | 
				
			||||||
        golds = [ex.gold for ex in examples]
 | 
					 | 
				
			||||||
        new_golds = []
 | 
					 | 
				
			||||||
        lengths = [len(d) for d in docs]
 | 
					 | 
				
			||||||
        states = self.moves.init_batch(docs)
 | 
					 | 
				
			||||||
        for gold in golds:
 | 
					 | 
				
			||||||
            self.moves.preprocess_gold(gold)
 | 
					 | 
				
			||||||
            new_golds.append(gold)
 | 
					 | 
				
			||||||
        set_dropout_rate(self.model, drop)
 | 
					 | 
				
			||||||
        model, backprop_tok2vec = self.model.begin_update(docs)
 | 
					 | 
				
			||||||
        states_d_scores, backprops, beams = _beam_utils.update_beam(
 | 
					 | 
				
			||||||
            self.moves,
 | 
					 | 
				
			||||||
            self.model.get_ref("lower").get_dim("nF"),
 | 
					 | 
				
			||||||
            10000,
 | 
					 | 
				
			||||||
            states,
 | 
					 | 
				
			||||||
            golds,
 | 
					 | 
				
			||||||
            model.state2vec,
 | 
					 | 
				
			||||||
            model.vec2scores,
 | 
					 | 
				
			||||||
            width,
 | 
					 | 
				
			||||||
            losses=losses,
 | 
					 | 
				
			||||||
            beam_density=beam_density
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        for i, d_scores in enumerate(states_d_scores):
 | 
					 | 
				
			||||||
            losses[self.name] += (d_scores**2).mean()
 | 
					 | 
				
			||||||
            ids, bp_vectors, bp_scores = backprops[i]
 | 
					 | 
				
			||||||
            d_vector = bp_scores(d_scores)
 | 
					 | 
				
			||||||
            if isinstance(model.ops, CupyOps) \
 | 
					 | 
				
			||||||
            and not isinstance(ids, model.state2vec.ops.xp.ndarray):
 | 
					 | 
				
			||||||
                model.backprops.append((
 | 
					 | 
				
			||||||
                    util.get_async(model.cuda_stream, ids),
 | 
					 | 
				
			||||||
                    util.get_async(model.cuda_stream, d_vector),
 | 
					 | 
				
			||||||
                    bp_vectors))
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                model.backprops.append((ids, d_vector, bp_vectors))
 | 
					 | 
				
			||||||
        backprop_tok2vec(golds)
 | 
					 | 
				
			||||||
        if sgd is not None:
 | 
					 | 
				
			||||||
            self.model.finish_update(sgd)
 | 
					 | 
				
			||||||
        if set_annotations:
 | 
					 | 
				
			||||||
            self.set_annotations(docs, beams)
 | 
					 | 
				
			||||||
        cdef Beam beam
 | 
					 | 
				
			||||||
        for beam in beams:
 | 
					 | 
				
			||||||
            _beam_utils.cleanup_beam(beam)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get_gradients(self):
 | 
					    def get_gradients(self):
 | 
				
			||||||
        """Get non-zero gradients of the model's parameters, as a dictionary
 | 
					        """Get non-zero gradients of the model's parameters, as a dictionary
 | 
				
			||||||
        keyed by the parameter ID. The values are (weights, gradients) tuples.
 | 
					        keyed by the parameter ID. The values are (weights, gradients) tuples.
 | 
				
			||||||
| 
						 | 
					@ -511,66 +360,8 @@ cdef class Parser:
 | 
				
			||||||
                queue.extend(node._layers)
 | 
					                queue.extend(node._layers)
 | 
				
			||||||
        return gradients
 | 
					        return gradients
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _init_gold_batch_no_cut(self, whole_examples):
 | 
					 | 
				
			||||||
        states = self.moves.init_batch([eg.doc for eg in whole_examples])
 | 
					 | 
				
			||||||
        good_docs = []
 | 
					 | 
				
			||||||
        good_golds = []
 | 
					 | 
				
			||||||
        good_states = []
 | 
					 | 
				
			||||||
        for i, eg in enumerate(whole_examples):
 | 
					 | 
				
			||||||
            doc = eg.doc
 | 
					 | 
				
			||||||
            gold = self.moves.preprocess_gold(eg.gold)
 | 
					 | 
				
			||||||
            if gold is not None and self.moves.has_gold(gold):
 | 
					 | 
				
			||||||
                good_docs.append(doc)
 | 
					 | 
				
			||||||
                good_golds.append(gold)
 | 
					 | 
				
			||||||
                good_states.append(states[i])
 | 
					 | 
				
			||||||
        n_moves = []
 | 
					 | 
				
			||||||
        for doc, gold in zip(good_docs, good_golds):
 | 
					 | 
				
			||||||
            oracle_actions = self.moves.get_oracle_sequence(doc, gold)
 | 
					 | 
				
			||||||
            n_moves.append(len(oracle_actions))
 | 
					 | 
				
			||||||
        return good_states, good_golds, max(n_moves, default=0) * 2
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _init_gold_batch(self, whole_examples, min_length=5, max_length=500):
 | 
					 | 
				
			||||||
        """Make a square batch, of length equal to the shortest doc. A long
 | 
					 | 
				
			||||||
        doc will get multiple states. Let's say we have a doc of length 2*N,
 | 
					 | 
				
			||||||
        where N is the shortest doc. We'll make two states, one representing
 | 
					 | 
				
			||||||
        long_doc[:N], and another representing long_doc[N:]."""
 | 
					 | 
				
			||||||
        cdef:
 | 
					 | 
				
			||||||
            StateClass state
 | 
					 | 
				
			||||||
            Transition action
 | 
					 | 
				
			||||||
        whole_docs = [ex.doc for ex in whole_examples]
 | 
					 | 
				
			||||||
        whole_golds = [ex.gold for ex in whole_examples]
 | 
					 | 
				
			||||||
        whole_states = self.moves.init_batch(whole_docs)
 | 
					 | 
				
			||||||
        max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
 | 
					 | 
				
			||||||
        max_moves = 0
 | 
					 | 
				
			||||||
        states = []
 | 
					 | 
				
			||||||
        golds = []
 | 
					 | 
				
			||||||
        for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
 | 
					 | 
				
			||||||
            gold = self.moves.preprocess_gold(gold)
 | 
					 | 
				
			||||||
            if gold is None:
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
            oracle_actions = self.moves.get_oracle_sequence(doc, gold)
 | 
					 | 
				
			||||||
            start = 0
 | 
					 | 
				
			||||||
            while start < len(doc):
 | 
					 | 
				
			||||||
                state = state.copy()
 | 
					 | 
				
			||||||
                n_moves = 0
 | 
					 | 
				
			||||||
                while state.B(0) < start and not state.is_final():
 | 
					 | 
				
			||||||
                    action = self.moves.c[oracle_actions.pop(0)]
 | 
					 | 
				
			||||||
                    action.do(state.c, action.label)
 | 
					 | 
				
			||||||
                    state.c.push_hist(action.clas)
 | 
					 | 
				
			||||||
                    n_moves += 1
 | 
					 | 
				
			||||||
                has_gold = self.moves.has_gold(gold, start=start,
 | 
					 | 
				
			||||||
                                               end=start+max_length)
 | 
					 | 
				
			||||||
                if not state.is_final() and has_gold:
 | 
					 | 
				
			||||||
                    states.append(state)
 | 
					 | 
				
			||||||
                    golds.append(gold)
 | 
					 | 
				
			||||||
                    max_moves = max(max_moves, n_moves)
 | 
					 | 
				
			||||||
                start += min(max_length, len(doc)-start)
 | 
					 | 
				
			||||||
            max_moves = max(max_moves, len(oracle_actions))
 | 
					 | 
				
			||||||
        return states, golds, max_moves
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
 | 
					    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
 | 
				
			||||||
        cdef StateClass state
 | 
					        cdef StateClass state
 | 
				
			||||||
        cdef GoldParse gold
 | 
					 | 
				
			||||||
        cdef Pool mem = Pool()
 | 
					        cdef Pool mem = Pool()
 | 
				
			||||||
        cdef int i
 | 
					        cdef int i
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -613,9 +404,11 @@ cdef class Parser:
 | 
				
			||||||
        if not hasattr(get_examples, '__call__'):
 | 
					        if not hasattr(get_examples, '__call__'):
 | 
				
			||||||
            gold_tuples = get_examples
 | 
					            gold_tuples = get_examples
 | 
				
			||||||
            get_examples = lambda: gold_tuples
 | 
					            get_examples = lambda: gold_tuples
 | 
				
			||||||
        actions = self.moves.get_actions(gold_parses=get_examples(),
 | 
					        actions = self.moves.get_actions(
 | 
				
			||||||
                                         min_freq=self.cfg['min_action_freq'],
 | 
					            examples=get_examples(),
 | 
				
			||||||
                                         learn_tokens=self.cfg["learn_tokens"])
 | 
					            min_freq=self.cfg['min_action_freq'],
 | 
				
			||||||
 | 
					            learn_tokens=self.cfg["learn_tokens"]
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
        for action, labels in self.moves.labels.items():
 | 
					        for action, labels in self.moves.labels.items():
 | 
				
			||||||
            actions.setdefault(action, {})
 | 
					            actions.setdefault(action, {})
 | 
				
			||||||
            for label, freq in labels.items():
 | 
					            for label, freq in labels.items():
 | 
				
			||||||
| 
						 | 
					@ -627,13 +420,8 @@ cdef class Parser:
 | 
				
			||||||
        if sgd is None:
 | 
					        if sgd is None:
 | 
				
			||||||
            sgd = self.create_optimizer()
 | 
					            sgd = self.create_optimizer()
 | 
				
			||||||
        doc_sample = []
 | 
					        doc_sample = []
 | 
				
			||||||
        gold_sample = []
 | 
					 | 
				
			||||||
        for example in islice(get_examples(), 10):
 | 
					        for example in islice(get_examples(), 10):
 | 
				
			||||||
            parses = example.get_gold_parses(merge=False, vocab=self.vocab)
 | 
					            doc_sample.append(example.predicted)
 | 
				
			||||||
            for doc, gold in parses:
 | 
					 | 
				
			||||||
                if len(doc):
 | 
					 | 
				
			||||||
                    doc_sample.append(doc)
 | 
					 | 
				
			||||||
                    gold_sample.append(gold)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if pipeline is not None:
 | 
					        if pipeline is not None:
 | 
				
			||||||
            for name, component in pipeline:
 | 
					            for name, component in pipeline:
 | 
				
			||||||
| 
						 | 
					@ -652,12 +440,6 @@ cdef class Parser:
 | 
				
			||||||
        link_vectors_to_models(self.vocab)
 | 
					        link_vectors_to_models(self.vocab)
 | 
				
			||||||
        return sgd
 | 
					        return sgd
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _get_doc(self, example):
 | 
					 | 
				
			||||||
        """ Use this method if the `example` can be both a Doc or an Example """
 | 
					 | 
				
			||||||
        if isinstance(example, Doc):
 | 
					 | 
				
			||||||
            return example
 | 
					 | 
				
			||||||
        return example.doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def to_disk(self, path, exclude=tuple(), **kwargs):
 | 
					    def to_disk(self, path, exclude=tuple(), **kwargs):
 | 
				
			||||||
        serializers = {
 | 
					        serializers = {
 | 
				
			||||||
            'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
 | 
					            'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
 | 
				
			||||||
| 
						 | 
					@ -714,3 +496,42 @@ cdef class Parser:
 | 
				
			||||||
                except AttributeError:
 | 
					                except AttributeError:
 | 
				
			||||||
                    raise ValueError(Errors.E149)
 | 
					                    raise ValueError(Errors.E149)
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _init_gold_batch(self, examples, min_length=5, max_length=500):
 | 
				
			||||||
 | 
					        """Make a square batch, of length equal to the shortest doc. A long
 | 
				
			||||||
 | 
					        doc will get multiple states. Let's say we have a doc of length 2*N,
 | 
				
			||||||
 | 
					        where N is the shortest doc. We'll make two states, one representing
 | 
				
			||||||
 | 
					        long_doc[:N], and another representing long_doc[N:]."""
 | 
				
			||||||
 | 
					        cdef:
 | 
				
			||||||
 | 
					            StateClass state
 | 
				
			||||||
 | 
					            Transition action
 | 
				
			||||||
 | 
					        all_states = self.moves.init_batch([eg.predicted for eg in examples])
 | 
				
			||||||
 | 
					        kept = []
 | 
				
			||||||
 | 
					        for state, eg in zip(all_states, examples):
 | 
				
			||||||
 | 
					            if self.moves.has_gold(eg) and not state.is_final():
 | 
				
			||||||
 | 
					                gold = self.moves.init_gold(state, eg)
 | 
				
			||||||
 | 
					                kept.append((eg, state, gold))
 | 
				
			||||||
 | 
					        max_length = max(min_length, min(max_length, min([len(eg.x) for eg in examples])))
 | 
				
			||||||
 | 
					        max_moves = 0
 | 
				
			||||||
 | 
					        states = []
 | 
				
			||||||
 | 
					        golds = []
 | 
				
			||||||
 | 
					        for eg, state, gold in kept:
 | 
				
			||||||
 | 
					            oracle_actions = self.moves.get_oracle_sequence(eg)
 | 
				
			||||||
 | 
					            start = 0
 | 
				
			||||||
 | 
					            while start < len(eg.predicted):
 | 
				
			||||||
 | 
					                state = state.copy()
 | 
				
			||||||
 | 
					                n_moves = 0
 | 
				
			||||||
 | 
					                while state.B(0) < start and not state.is_final():
 | 
				
			||||||
 | 
					                    action = self.moves.c[oracle_actions.pop(0)]
 | 
				
			||||||
 | 
					                    action.do(state.c, action.label)
 | 
				
			||||||
 | 
					                    state.c.push_hist(action.clas)
 | 
				
			||||||
 | 
					                    n_moves += 1
 | 
				
			||||||
 | 
					                has_gold = self.moves.has_gold(eg, start=start,
 | 
				
			||||||
 | 
					                                               end=start+max_length)
 | 
				
			||||||
 | 
					                if not state.is_final() and has_gold:
 | 
				
			||||||
 | 
					                    states.append(state)
 | 
				
			||||||
 | 
					                    golds.append(gold)
 | 
				
			||||||
 | 
					                    max_moves = max(max_moves, n_moves)
 | 
				
			||||||
 | 
					                start += min(max_length, len(eg.x)-start)
 | 
				
			||||||
 | 
					            max_moves = max(max_moves, len(oracle_actions))
 | 
				
			||||||
 | 
					        return states, golds, max_moves
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,7 +7,6 @@ from copy import copy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..tokens.doc cimport Doc, set_children_from_heads
 | 
					from ..tokens.doc cimport Doc, set_children_from_heads
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..gold import Example
 | 
					 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -51,7 +50,11 @@ def is_nonproj_arc(tokenid, heads):
 | 
				
			||||||
    elif head is None:  # unattached tokens cannot be non-projective
 | 
					    elif head is None:  # unattached tokens cannot be non-projective
 | 
				
			||||||
        return False
 | 
					        return False
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
 | 
					    cdef int start, end
 | 
				
			||||||
 | 
					    if head < tokenid:
 | 
				
			||||||
 | 
					        start, end = (head+1, tokenid)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        start, end = (tokenid+1, head)
 | 
				
			||||||
    for k in range(start, end):
 | 
					    for k in range(start, end):
 | 
				
			||||||
        for ancestor in ancestors(k, heads):
 | 
					        for ancestor in ancestors(k, heads):
 | 
				
			||||||
            if ancestor is None:  # for unattached tokens/subtrees
 | 
					            if ancestor is None:  # for unattached tokens/subtrees
 | 
				
			||||||
| 
						 | 
					@ -78,8 +81,8 @@ def is_decorated(label):
 | 
				
			||||||
def count_decorated_labels(gold_data):
 | 
					def count_decorated_labels(gold_data):
 | 
				
			||||||
    freqs = {}
 | 
					    freqs = {}
 | 
				
			||||||
    for example in gold_data:
 | 
					    for example in gold_data:
 | 
				
			||||||
        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
 | 
					        proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
 | 
				
			||||||
                                             example.token_annotation.deps)
 | 
					                                             example.get_aligned("DEP"))
 | 
				
			||||||
        # set the label to ROOT for each root dependent
 | 
					        # set the label to ROOT for each root dependent
 | 
				
			||||||
        deco_deps = ['ROOT' if head == i else deco_deps[i]
 | 
					        deco_deps = ['ROOT' if head == i else deco_deps[i]
 | 
				
			||||||
                       for i, head in enumerate(proj_heads)]
 | 
					                       for i, head in enumerate(proj_heads)]
 | 
				
			||||||
| 
						 | 
					@ -90,31 +93,6 @@ def count_decorated_labels(gold_data):
 | 
				
			||||||
    return freqs
 | 
					    return freqs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def preprocess_training_data(gold_data, label_freq_cutoff=30):
 | 
					 | 
				
			||||||
    preprocessed = []
 | 
					 | 
				
			||||||
    freqs = {}
 | 
					 | 
				
			||||||
    for example in gold_data:
 | 
					 | 
				
			||||||
        new_example = Example(doc=example.doc)
 | 
					 | 
				
			||||||
        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
 | 
					 | 
				
			||||||
                                             example.token_annotation.deps)
 | 
					 | 
				
			||||||
        # set the label to ROOT for each root dependent
 | 
					 | 
				
			||||||
        deco_deps = ['ROOT' if head == i else deco_deps[i]
 | 
					 | 
				
			||||||
                       for i, head in enumerate(proj_heads)]
 | 
					 | 
				
			||||||
        # count label frequencies
 | 
					 | 
				
			||||||
        if label_freq_cutoff > 0:
 | 
					 | 
				
			||||||
            for label in deco_deps:
 | 
					 | 
				
			||||||
                if is_decorated(label):
 | 
					 | 
				
			||||||
                    freqs[label] = freqs.get(label, 0) + 1
 | 
					 | 
				
			||||||
        proj_token_dict = example.token_annotation.to_dict()
 | 
					 | 
				
			||||||
        proj_token_dict["heads"] = proj_heads
 | 
					 | 
				
			||||||
        proj_token_dict["deps"] = deco_deps
 | 
					 | 
				
			||||||
        new_example.set_token_annotation(**proj_token_dict)
 | 
					 | 
				
			||||||
        preprocessed.append(new_example)
 | 
					 | 
				
			||||||
    if label_freq_cutoff > 0:
 | 
					 | 
				
			||||||
        return _filter_labels(preprocessed, label_freq_cutoff, freqs)
 | 
					 | 
				
			||||||
    return preprocessed
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def projectivize(heads, labels):
 | 
					def projectivize(heads, labels):
 | 
				
			||||||
    # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
 | 
					    # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
 | 
				
			||||||
    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
 | 
					    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
 | 
				
			||||||
| 
						 | 
					@ -200,22 +178,3 @@ def _find_new_head(token, headlabel):
 | 
				
			||||||
                next_queue.append(child)
 | 
					                next_queue.append(child)
 | 
				
			||||||
        queue = next_queue
 | 
					        queue = next_queue
 | 
				
			||||||
    return token.head
 | 
					    return token.head
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def _filter_labels(examples, cutoff, freqs):
 | 
					 | 
				
			||||||
    # throw away infrequent decorated labels
 | 
					 | 
				
			||||||
    # can't learn them reliably anyway and keeps label set smaller
 | 
					 | 
				
			||||||
    filtered = []
 | 
					 | 
				
			||||||
    for example in examples:
 | 
					 | 
				
			||||||
        new_example = Example(doc=example.doc)
 | 
					 | 
				
			||||||
        filtered_labels = []
 | 
					 | 
				
			||||||
        for label in example.token_annotation.deps:
 | 
					 | 
				
			||||||
            if is_decorated(label) and freqs.get(label, 0) < cutoff:
 | 
					 | 
				
			||||||
                filtered_labels.append(decompose(label)[0])
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                filtered_labels.append(label)
 | 
					 | 
				
			||||||
        filtered_token_dict = example.token_annotation.to_dict()
 | 
					 | 
				
			||||||
        filtered_token_dict["deps"] = filtered_labels
 | 
					 | 
				
			||||||
        new_example.set_token_annotation(**filtered_token_dict)
 | 
					 | 
				
			||||||
        filtered.append(new_example)
 | 
					 | 
				
			||||||
    return filtered
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2,11 +2,10 @@ from cymem.cymem cimport Pool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..typedefs cimport attr_t, weight_t
 | 
					from ..typedefs cimport attr_t, weight_t
 | 
				
			||||||
from ..structs cimport TokenC
 | 
					from ..structs cimport TokenC
 | 
				
			||||||
from ..gold cimport GoldParse
 | 
					 | 
				
			||||||
from ..gold cimport GoldParseC
 | 
					 | 
				
			||||||
from ..strings cimport StringStore
 | 
					from ..strings cimport StringStore
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
from ._state cimport StateC
 | 
					from ._state cimport StateC
 | 
				
			||||||
 | 
					from ..gold.example cimport Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef struct Transition:
 | 
					cdef struct Transition:
 | 
				
			||||||
| 
						 | 
					@ -17,14 +16,14 @@ cdef struct Transition:
 | 
				
			||||||
    weight_t score
 | 
					    weight_t score
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    bint (*is_valid)(const StateC* state, attr_t label) nogil
 | 
					    bint (*is_valid)(const StateC* state, attr_t label) nogil
 | 
				
			||||||
    weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
 | 
					    weight_t (*get_cost)(StateClass state, const void* gold, attr_t label) nogil
 | 
				
			||||||
    int (*do)(StateC* state, attr_t label) nogil
 | 
					    int (*do)(StateC* state, attr_t label) nogil
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
 | 
					ctypedef weight_t (*get_cost_func_t)(StateClass state, const void* gold,
 | 
				
			||||||
        attr_tlabel) nogil
 | 
					        attr_tlabel) nogil
 | 
				
			||||||
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
 | 
					ctypedef weight_t (*move_cost_func_t)(StateClass state, const void* gold) nogil
 | 
				
			||||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
 | 
					ctypedef weight_t (*label_cost_func_t)(StateClass state, const void*
 | 
				
			||||||
        gold, attr_t label) nogil
 | 
					        gold, attr_t label) nogil
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
 | 
					ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
 | 
				
			||||||
| 
						 | 
					@ -41,8 +40,6 @@ cdef class TransitionSystem:
 | 
				
			||||||
    cdef int _size
 | 
					    cdef int _size
 | 
				
			||||||
    cdef public attr_t root_label
 | 
					    cdef public attr_t root_label
 | 
				
			||||||
    cdef public freqs
 | 
					    cdef public freqs
 | 
				
			||||||
    cdef init_state_t init_beam_state
 | 
					 | 
				
			||||||
    cdef del_state_t del_beam_state
 | 
					 | 
				
			||||||
    cdef public object labels
 | 
					    cdef public object labels
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int initialize_state(self, StateC* state) nogil
 | 
					    cdef int initialize_state(self, StateC* state) nogil
 | 
				
			||||||
| 
						 | 
					@ -55,4 +52,4 @@ cdef class TransitionSystem:
 | 
				
			||||||
    cdef int set_valid(self, int* output, const StateC* st) nogil
 | 
					    cdef int set_valid(self, int* output, const StateC* st) nogil
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int set_costs(self, int* is_valid, weight_t* costs,
 | 
					    cdef int set_costs(self, int* is_valid, weight_t* costs,
 | 
				
			||||||
                       StateClass state, GoldParse gold) except -1
 | 
					                       StateClass state, gold) except -1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,13 +1,12 @@
 | 
				
			||||||
# cython: infer_types=True
 | 
					# cython: infer_types=True
 | 
				
			||||||
 | 
					from __future__ import print_function
 | 
				
			||||||
from cpython.ref cimport Py_INCREF
 | 
					from cpython.ref cimport Py_INCREF
 | 
				
			||||||
from cymem.cymem cimport Pool
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
from thinc.extra.search cimport Beam
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from collections import Counter
 | 
					from collections import Counter
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..typedefs cimport weight_t
 | 
					from ..typedefs cimport weight_t
 | 
				
			||||||
from . cimport _beam_utils
 | 
					 | 
				
			||||||
from ..tokens.doc cimport Doc
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
from ..structs cimport TokenC
 | 
					from ..structs cimport TokenC
 | 
				
			||||||
from .stateclass cimport StateClass
 | 
					from .stateclass cimport StateClass
 | 
				
			||||||
| 
						 | 
					@ -47,8 +46,6 @@ cdef class TransitionSystem:
 | 
				
			||||||
        if labels_by_action:
 | 
					        if labels_by_action:
 | 
				
			||||||
            self.initialize_actions(labels_by_action, min_freq=min_freq)
 | 
					            self.initialize_actions(labels_by_action, min_freq=min_freq)
 | 
				
			||||||
        self.root_label = self.strings.add('ROOT')
 | 
					        self.root_label = self.strings.add('ROOT')
 | 
				
			||||||
        self.init_beam_state = _init_state
 | 
					 | 
				
			||||||
        self.del_beam_state = _del_state
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __reduce__(self):
 | 
					    def __reduce__(self):
 | 
				
			||||||
        return (self.__class__, (self.strings, self.labels), None, None)
 | 
					        return (self.__class__, (self.strings, self.labels), None, None)
 | 
				
			||||||
| 
						 | 
					@ -64,48 +61,55 @@ cdef class TransitionSystem:
 | 
				
			||||||
            offset += len(doc)
 | 
					            offset += len(doc)
 | 
				
			||||||
        return states
 | 
					        return states
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def init_beams(self, docs, beam_width, beam_density=0.):
 | 
					    def get_oracle_sequence(self, Example example, _debug=False):
 | 
				
			||||||
        cdef Doc doc
 | 
					 | 
				
			||||||
        beams = []
 | 
					 | 
				
			||||||
        cdef int offset = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Doc objects might contain labels that we need to register actions for. We need to check for that
 | 
					 | 
				
			||||||
        # *before* we create any Beam objects, because the Beam object needs the correct number of
 | 
					 | 
				
			||||||
        # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
 | 
					 | 
				
			||||||
        # and it doesn't matter that we create and discard the state objects.
 | 
					 | 
				
			||||||
        self.init_batch(docs)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        for doc in docs:
 | 
					 | 
				
			||||||
            beam = Beam(self.n_moves, beam_width, min_density=beam_density)
 | 
					 | 
				
			||||||
            beam.initialize(self.init_beam_state, self.del_beam_state,
 | 
					 | 
				
			||||||
                            doc.length, doc.c)
 | 
					 | 
				
			||||||
            for i in range(beam.width):
 | 
					 | 
				
			||||||
                state = <StateC*>beam.at(i)
 | 
					 | 
				
			||||||
                state.offset = offset
 | 
					 | 
				
			||||||
            offset += len(doc)
 | 
					 | 
				
			||||||
            beam.check_done(_beam_utils.check_final_state, NULL)
 | 
					 | 
				
			||||||
            beams.append(beam)
 | 
					 | 
				
			||||||
        return beams
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get_oracle_sequence(self, doc, GoldParse gold):
 | 
					 | 
				
			||||||
        cdef Pool mem = Pool()
 | 
					        cdef Pool mem = Pool()
 | 
				
			||||||
        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
 | 
					        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
 | 
				
			||||||
        assert self.n_moves > 0
 | 
					        assert self.n_moves > 0
 | 
				
			||||||
        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
 | 
					        costs = <float*>mem.alloc(self.n_moves, sizeof(float))
 | 
				
			||||||
        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
 | 
					        is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cdef StateClass state = StateClass(doc, offset=0)
 | 
					        cdef StateClass state
 | 
				
			||||||
        self.initialize_state(state.c)
 | 
					        states, golds, n_steps = self.init_gold_batch([example])
 | 
				
			||||||
 | 
					        if not states:
 | 
				
			||||||
 | 
					            return []
 | 
				
			||||||
 | 
					        state = states[0]
 | 
				
			||||||
 | 
					        gold = golds[0]
 | 
				
			||||||
        history = []
 | 
					        history = []
 | 
				
			||||||
 | 
					        debug_log = []
 | 
				
			||||||
        while not state.is_final():
 | 
					        while not state.is_final():
 | 
				
			||||||
            self.set_costs(is_valid, costs, state, gold)
 | 
					            self.set_costs(is_valid, costs, state, gold)
 | 
				
			||||||
            for i in range(self.n_moves):
 | 
					            for i in range(self.n_moves):
 | 
				
			||||||
                if is_valid[i] and costs[i] <= 0:
 | 
					                if is_valid[i] and costs[i] <= 0:
 | 
				
			||||||
                    action = self.c[i]
 | 
					                    action = self.c[i]
 | 
				
			||||||
                    history.append(i)
 | 
					                    history.append(i)
 | 
				
			||||||
 | 
					                    s0 = state.S(0)
 | 
				
			||||||
 | 
					                    b0 = state.B(0)
 | 
				
			||||||
 | 
					                    if _debug:
 | 
				
			||||||
 | 
					                        debug_log.append(" ".join((
 | 
				
			||||||
 | 
					                            self.get_class_name(i),
 | 
				
			||||||
 | 
					                            "S0=", (example.x[s0].text if s0 >= 0 else "__"),
 | 
				
			||||||
 | 
					                            "B0=", (example.x[b0].text if b0 >= 0 else "__"),
 | 
				
			||||||
 | 
					                            "S0 head?", str(state.has_head(state.S(0))),
 | 
				
			||||||
 | 
					                        )))
 | 
				
			||||||
                    action.do(state.c, action.label)
 | 
					                    action.do(state.c, action.label)
 | 
				
			||||||
                    break
 | 
					                    break
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
 | 
					                if _debug:
 | 
				
			||||||
 | 
					                    print("Actions")
 | 
				
			||||||
 | 
					                    for i in range(self.n_moves):
 | 
				
			||||||
 | 
					                        print(self.get_class_name(i))
 | 
				
			||||||
 | 
					                    print("Gold")
 | 
				
			||||||
 | 
					                    for token in example.y:
 | 
				
			||||||
 | 
					                        print(token.text, token.dep_, token.head.text)
 | 
				
			||||||
 | 
					                    s0 = state.S(0)
 | 
				
			||||||
 | 
					                    b0 = state.B(0)
 | 
				
			||||||
 | 
					                    debug_log.append(" ".join((
 | 
				
			||||||
 | 
					                        "?",
 | 
				
			||||||
 | 
					                        "S0=", (example.x[s0].text if s0 >= 0 else "-"),
 | 
				
			||||||
 | 
					                        "B0=", (example.x[b0].text if b0 >= 0 else "-"),
 | 
				
			||||||
 | 
					                        "S0 head?", str(state.has_head(state.S(0))),
 | 
				
			||||||
 | 
					                    )))
 | 
				
			||||||
 | 
					                    print("\n".join(debug_log))
 | 
				
			||||||
                raise ValueError(Errors.E024)
 | 
					                raise ValueError(Errors.E024)
 | 
				
			||||||
        return history
 | 
					        return history
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -124,12 +128,6 @@ cdef class TransitionSystem:
 | 
				
			||||||
    def finalize_doc(self, doc):
 | 
					    def finalize_doc(self, doc):
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def preprocess_gold(self, GoldParse gold):
 | 
					 | 
				
			||||||
        raise NotImplementedError
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def is_gold_parse(self, StateClass state, GoldParse gold):
 | 
					 | 
				
			||||||
        raise NotImplementedError
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef Transition lookup_transition(self, object name) except *:
 | 
					    cdef Transition lookup_transition(self, object name) except *:
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -148,18 +146,8 @@ cdef class TransitionSystem:
 | 
				
			||||||
            is_valid[i] = self.c[i].is_valid(st, self.c[i].label)
 | 
					            is_valid[i] = self.c[i].is_valid(st, self.c[i].label)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int set_costs(self, int* is_valid, weight_t* costs,
 | 
					    cdef int set_costs(self, int* is_valid, weight_t* costs,
 | 
				
			||||||
                       StateClass stcls, GoldParse gold) except -1:
 | 
					                       StateClass stcls, gold) except -1:
 | 
				
			||||||
        cdef int i
 | 
					        raise NotImplementedError
 | 
				
			||||||
        self.set_valid(is_valid, stcls.c)
 | 
					 | 
				
			||||||
        cdef int n_gold = 0
 | 
					 | 
				
			||||||
        for i in range(self.n_moves):
 | 
					 | 
				
			||||||
            if is_valid[i]:
 | 
					 | 
				
			||||||
                costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
 | 
					 | 
				
			||||||
                n_gold += costs[i] <= 0
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                costs[i] = 9000
 | 
					 | 
				
			||||||
        if n_gold <= 0:
 | 
					 | 
				
			||||||
            raise ValueError(Errors.E024)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_class_name(self, int clas):
 | 
					    def get_class_name(self, int clas):
 | 
				
			||||||
        act = self.c[clas]
 | 
					        act = self.c[clas]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,6 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
from spacy.attrs import ORTH, SHAPE, POS, DEP
 | 
					from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..util import get_doc
 | 
					from ..util import get_doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -44,6 +44,20 @@ def test_doc_array_tag(en_vocab):
 | 
				
			||||||
    assert feats_array[3][1] == doc[3].pos
 | 
					    assert feats_array[3][1] == doc[3].pos
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_doc_array_morph(en_vocab):
 | 
				
			||||||
 | 
					    words = ["Eat", "blue", "ham"]
 | 
				
			||||||
 | 
					    morph = ["Feat=V", "Feat=J", "Feat=N"]
 | 
				
			||||||
 | 
					    doc = get_doc(en_vocab, words=words, morphs=morph)
 | 
				
			||||||
 | 
					    assert morph[0] == doc[0].morph_
 | 
				
			||||||
 | 
					    assert morph[1] == doc[1].morph_
 | 
				
			||||||
 | 
					    assert morph[2] == doc[2].morph_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    feats_array = doc.to_array((ORTH, MORPH))
 | 
				
			||||||
 | 
					    assert feats_array[0][1] == doc[0].morph.key
 | 
				
			||||||
 | 
					    assert feats_array[1][1] == doc[1].morph.key
 | 
				
			||||||
 | 
					    assert feats_array[2][1] == doc[2].morph.key
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_doc_array_dep(en_vocab):
 | 
					def test_doc_array_dep(en_vocab):
 | 
				
			||||||
    words = ["A", "nice", "sentence", "."]
 | 
					    words = ["A", "nice", "sentence", "."]
 | 
				
			||||||
    deps = ["det", "amod", "ROOT", "punct"]
 | 
					    deps = ["det", "amod", "ROOT", "punct"]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,8 +1,9 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from thinc.api import Adam
 | 
					from thinc.api import Adam
 | 
				
			||||||
from spacy.attrs import NORM
 | 
					from spacy.attrs import NORM
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.pipeline.defaults import default_parser, default_ner
 | 
					from spacy.pipeline.defaults import default_parser, default_ner
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
from spacy.pipeline import DependencyParser, EntityRecognizer
 | 
					from spacy.pipeline import DependencyParser, EntityRecognizer
 | 
				
			||||||
| 
						 | 
					@ -39,8 +40,9 @@ def _train_parser(parser):
 | 
				
			||||||
    for i in range(5):
 | 
					    for i in range(5):
 | 
				
			||||||
        losses = {}
 | 
					        losses = {}
 | 
				
			||||||
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
 | 
					        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
 | 
				
			||||||
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
 | 
					        gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
 | 
				
			||||||
        parser.update((doc, gold), sgd=sgd, losses=losses)
 | 
					        example = Example.from_dict(doc, gold)
 | 
				
			||||||
 | 
					        parser.update([example], sgd=sgd, losses=losses)
 | 
				
			||||||
    return parser
 | 
					    return parser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -51,10 +53,9 @@ def test_add_label(parser):
 | 
				
			||||||
    for i in range(100):
 | 
					    for i in range(100):
 | 
				
			||||||
        losses = {}
 | 
					        losses = {}
 | 
				
			||||||
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
 | 
					        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
 | 
				
			||||||
        gold = GoldParse(
 | 
					        gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
 | 
				
			||||||
            doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
 | 
					        example = Example.from_dict(doc, gold)
 | 
				
			||||||
        )
 | 
					        parser.update([example], sgd=sgd, losses=losses)
 | 
				
			||||||
        parser.update((doc, gold), sgd=sgd, losses=losses)
 | 
					 | 
				
			||||||
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
 | 
					    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
 | 
				
			||||||
    doc = parser(doc)
 | 
					    doc = parser(doc)
 | 
				
			||||||
    assert doc[0].dep_ == "right"
 | 
					    assert doc[0].dep_ == "right"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,22 +1,23 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.pipeline.defaults import default_parser
 | 
					from spacy.pipeline.defaults import default_parser
 | 
				
			||||||
from spacy.pipeline import DependencyParser
 | 
					from spacy.pipeline import DependencyParser
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					 | 
				
			||||||
from spacy.syntax.nonproj import projectivize
 | 
					from spacy.syntax.nonproj import projectivize
 | 
				
			||||||
from spacy.syntax.stateclass import StateClass
 | 
					 | 
				
			||||||
from spacy.syntax.arc_eager import ArcEager
 | 
					from spacy.syntax.arc_eager import ArcEager
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_sequence_costs(M, words, heads, deps, transitions):
 | 
					def get_sequence_costs(M, words, heads, deps, transitions):
 | 
				
			||||||
    doc = Doc(Vocab(), words=words)
 | 
					    doc = Doc(Vocab(), words=words)
 | 
				
			||||||
    gold = GoldParse(doc, heads=heads, deps=deps)
 | 
					    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
 | 
				
			||||||
    state = StateClass(doc)
 | 
					    states, golds, _ = M.init_gold_batch([example])
 | 
				
			||||||
    M.preprocess_gold(gold)
 | 
					    state = states[0]
 | 
				
			||||||
 | 
					    gold = golds[0]
 | 
				
			||||||
    cost_history = []
 | 
					    cost_history = []
 | 
				
			||||||
    for gold_action in transitions:
 | 
					    for gold_action in transitions:
 | 
				
			||||||
 | 
					        gold.update(state)
 | 
				
			||||||
        state_costs = {}
 | 
					        state_costs = {}
 | 
				
			||||||
        for i in range(M.n_moves):
 | 
					        for i in range(M.n_moves):
 | 
				
			||||||
            name = M.class_name(i)
 | 
					            name = M.class_name(i)
 | 
				
			||||||
| 
						 | 
					@ -39,31 +40,13 @@ def arc_eager(vocab):
 | 
				
			||||||
    return moves
 | 
					    return moves
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def words():
 | 
					 | 
				
			||||||
    return ["a", "b"]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def doc(words, vocab):
 | 
					 | 
				
			||||||
    if vocab is None:
 | 
					 | 
				
			||||||
        vocab = Vocab()
 | 
					 | 
				
			||||||
    return Doc(vocab, words=list(words))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def gold(doc, words):
 | 
					 | 
				
			||||||
    if len(words) == 2:
 | 
					 | 
				
			||||||
        return GoldParse(doc, words=["a", "b"], heads=[0, 0], deps=["ROOT", "right"])
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        raise NotImplementedError
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.mark.xfail
 | 
					 | 
				
			||||||
def test_oracle_four_words(arc_eager, vocab):
 | 
					def test_oracle_four_words(arc_eager, vocab):
 | 
				
			||||||
    words = ["a", "b", "c", "d"]
 | 
					    words = ["a", "b", "c", "d"]
 | 
				
			||||||
    heads = [1, 1, 3, 3]
 | 
					    heads = [1, 1, 3, 3]
 | 
				
			||||||
    deps = ["left", "ROOT", "left", "ROOT"]
 | 
					    deps = ["left", "ROOT", "left", "ROOT"]
 | 
				
			||||||
 | 
					    for dep in deps:
 | 
				
			||||||
 | 
					        arc_eager.add_action(2, dep)  # Left
 | 
				
			||||||
 | 
					        arc_eager.add_action(3, dep)  # Right
 | 
				
			||||||
    actions = ["L-left", "B-ROOT", "L-left"]
 | 
					    actions = ["L-left", "B-ROOT", "L-left"]
 | 
				
			||||||
    state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
 | 
					    state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
 | 
				
			||||||
    assert state.is_final()
 | 
					    assert state.is_final()
 | 
				
			||||||
| 
						 | 
					@ -72,7 +55,7 @@ def test_oracle_four_words(arc_eager, vocab):
 | 
				
			||||||
        assert state_costs[actions[i]] == 0.0, actions[i]
 | 
					        assert state_costs[actions[i]] == 0.0, actions[i]
 | 
				
			||||||
        for other_action, cost in state_costs.items():
 | 
					        for other_action, cost in state_costs.items():
 | 
				
			||||||
            if other_action != actions[i]:
 | 
					            if other_action != actions[i]:
 | 
				
			||||||
                assert cost >= 1
 | 
					                assert cost >= 1, (i, other_action)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
annot_tuples = [
 | 
					annot_tuples = [
 | 
				
			||||||
| 
						 | 
					@ -140,7 +123,7 @@ def test_get_oracle_actions():
 | 
				
			||||||
    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
 | 
					    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
 | 
				
			||||||
    config = {
 | 
					    config = {
 | 
				
			||||||
        "learn_tokens": False,
 | 
					        "learn_tokens": False,
 | 
				
			||||||
        "min_action_freq": 30,
 | 
					        "min_action_freq": 0,
 | 
				
			||||||
        "beam_width": 1,
 | 
					        "beam_width": 1,
 | 
				
			||||||
        "beam_update_prob": 1.0,
 | 
					        "beam_update_prob": 1.0,
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
| 
						 | 
					@ -149,12 +132,98 @@ def test_get_oracle_actions():
 | 
				
			||||||
    parser.moves.add_action(1, "")
 | 
					    parser.moves.add_action(1, "")
 | 
				
			||||||
    parser.moves.add_action(1, "")
 | 
					    parser.moves.add_action(1, "")
 | 
				
			||||||
    parser.moves.add_action(4, "ROOT")
 | 
					    parser.moves.add_action(4, "ROOT")
 | 
				
			||||||
 | 
					    heads, deps = projectivize(heads, deps)
 | 
				
			||||||
    for i, (head, dep) in enumerate(zip(heads, deps)):
 | 
					    for i, (head, dep) in enumerate(zip(heads, deps)):
 | 
				
			||||||
        if head > i:
 | 
					        if head > i:
 | 
				
			||||||
            parser.moves.add_action(2, dep)
 | 
					            parser.moves.add_action(2, dep)
 | 
				
			||||||
        elif head < i:
 | 
					        elif head < i:
 | 
				
			||||||
            parser.moves.add_action(3, dep)
 | 
					            parser.moves.add_action(3, dep)
 | 
				
			||||||
    heads, deps = projectivize(heads, deps)
 | 
					    example = Example.from_dict(
 | 
				
			||||||
    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
 | 
					        doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}
 | 
				
			||||||
    parser.moves.preprocess_gold(gold)
 | 
					    )
 | 
				
			||||||
    parser.moves.get_oracle_sequence(doc, gold)
 | 
					    parser.moves.get_oracle_sequence(example)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_oracle_dev_sentence(vocab, arc_eager):
 | 
				
			||||||
 | 
					    words_deps_heads = """
 | 
				
			||||||
 | 
					        Rolls-Royce nn Inc.
 | 
				
			||||||
 | 
					        Motor nn Inc.
 | 
				
			||||||
 | 
					        Cars nn Inc.
 | 
				
			||||||
 | 
					        Inc. nsubj said
 | 
				
			||||||
 | 
					        said ROOT said
 | 
				
			||||||
 | 
					        it nsubj expects
 | 
				
			||||||
 | 
					        expects ccomp said
 | 
				
			||||||
 | 
					        its poss sales
 | 
				
			||||||
 | 
					        U.S. nn sales
 | 
				
			||||||
 | 
					        sales nsubj steady
 | 
				
			||||||
 | 
					        to aux steady
 | 
				
			||||||
 | 
					        remain cop steady
 | 
				
			||||||
 | 
					        steady xcomp expects
 | 
				
			||||||
 | 
					        at prep steady
 | 
				
			||||||
 | 
					        about quantmod 1,200
 | 
				
			||||||
 | 
					        1,200 num cars
 | 
				
			||||||
 | 
					        cars pobj at
 | 
				
			||||||
 | 
					        in prep steady
 | 
				
			||||||
 | 
					        1990 pobj in
 | 
				
			||||||
 | 
					        . punct said
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    expected_transitions = [
 | 
				
			||||||
 | 
					        "S",  # Shift 'Motor'
 | 
				
			||||||
 | 
					        "S",  # Shift 'Cars'
 | 
				
			||||||
 | 
					        "L-nn",  # Attach 'Cars' to 'Inc.'
 | 
				
			||||||
 | 
					        "L-nn",  # Attach 'Motor' to 'Inc.'
 | 
				
			||||||
 | 
					        "L-nn",  # Attach 'Rolls-Royce' to 'Inc.', force shift
 | 
				
			||||||
 | 
					        "L-nsubj",  # Attach 'Inc.' to 'said'
 | 
				
			||||||
 | 
					        "S",  # Shift 'it'
 | 
				
			||||||
 | 
					        "L-nsubj",  # Attach 'it.' to 'expects'
 | 
				
			||||||
 | 
					        "R-ccomp",  # Attach 'expects' to 'said'
 | 
				
			||||||
 | 
					        "S",  # Shift 'its'
 | 
				
			||||||
 | 
					        "S",  # Shift 'U.S.'
 | 
				
			||||||
 | 
					        "L-nn",  # Attach 'U.S.' to 'sales'
 | 
				
			||||||
 | 
					        "L-poss",  # Attach 'its' to 'sales'
 | 
				
			||||||
 | 
					        "S",  # Shift 'sales'
 | 
				
			||||||
 | 
					        "S",  # Shift 'to'
 | 
				
			||||||
 | 
					        "S",  # Shift 'remain'
 | 
				
			||||||
 | 
					        "L-cop",  # Attach 'remain' to 'steady'
 | 
				
			||||||
 | 
					        "L-aux",  # Attach 'to' to 'steady'
 | 
				
			||||||
 | 
					        "L-nsubj",  # Attach 'sales' to 'steady'
 | 
				
			||||||
 | 
					        "R-xcomp",  # Attach 'steady' to 'expects'
 | 
				
			||||||
 | 
					        "R-prep",  # Attach 'at' to 'steady'
 | 
				
			||||||
 | 
					        "S",  # Shift 'about'
 | 
				
			||||||
 | 
					        "L-quantmod",  # Attach "about" to "1,200"
 | 
				
			||||||
 | 
					        "S",  # Shift "1,200"
 | 
				
			||||||
 | 
					        "L-num",  # Attach "1,200" to "cars"
 | 
				
			||||||
 | 
					        "R-pobj",  # Attach "cars" to "at"
 | 
				
			||||||
 | 
					        "D",  # Reduce "cars"
 | 
				
			||||||
 | 
					        "D",  # Reduce "at"
 | 
				
			||||||
 | 
					        "R-prep",  # Attach "in" to "steady"
 | 
				
			||||||
 | 
					        "R-pobj",  # Attach "1990" to "in"
 | 
				
			||||||
 | 
					        "D",  # Reduce "1990"
 | 
				
			||||||
 | 
					        "D",  # Reduce "in"
 | 
				
			||||||
 | 
					        "D",  # Reduce "steady"
 | 
				
			||||||
 | 
					        "D",  # Reduce "expects"
 | 
				
			||||||
 | 
					        "R-punct",  # Attach "." to "said"
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    gold_words = []
 | 
				
			||||||
 | 
					    gold_deps = []
 | 
				
			||||||
 | 
					    gold_heads = []
 | 
				
			||||||
 | 
					    for line in words_deps_heads.strip().split("\n"):
 | 
				
			||||||
 | 
					        line = line.strip()
 | 
				
			||||||
 | 
					        if not line:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        word, dep, head = line.split()
 | 
				
			||||||
 | 
					        gold_words.append(word)
 | 
				
			||||||
 | 
					        gold_deps.append(dep)
 | 
				
			||||||
 | 
					        gold_heads.append(head)
 | 
				
			||||||
 | 
					    gold_heads = [gold_words.index(head) for head in gold_heads]
 | 
				
			||||||
 | 
					    for dep in gold_deps:
 | 
				
			||||||
 | 
					        arc_eager.add_action(2, dep)  # Left
 | 
				
			||||||
 | 
					        arc_eager.add_action(3, dep)  # Right
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    doc = Doc(Vocab(), words=gold_words)
 | 
				
			||||||
 | 
					    example = Example.from_dict(doc, {"heads": gold_heads, "deps": gold_deps})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ae_oracle_actions = arc_eager.get_oracle_sequence(example)
 | 
				
			||||||
 | 
					    ae_oracle_actions = [arc_eager.get_class_name(i) for i in ae_oracle_actions]
 | 
				
			||||||
 | 
					    assert ae_oracle_actions == expected_transitions
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,6 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					from spacy.attrs import ENT_IOB
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy import util
 | 
					from spacy import util
 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8,12 +10,11 @@ from spacy.pipeline.defaults import default_ner
 | 
				
			||||||
from spacy.pipeline import EntityRecognizer, EntityRuler
 | 
					from spacy.pipeline import EntityRecognizer, EntityRuler
 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
from spacy.syntax.ner import BiluoPushDown
 | 
					from spacy.syntax.ner import BiluoPushDown
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..util import make_tempdir
 | 
					from ..util import make_tempdir
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
TRAIN_DATA = [
 | 
					TRAIN_DATA = [
 | 
				
			||||||
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
 | 
					    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
 | 
				
			||||||
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
 | 
					    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
 | 
				
			||||||
| 
						 | 
					@ -52,51 +53,55 @@ def tsys(vocab, entity_types):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_get_oracle_moves(tsys, doc, entity_annots):
 | 
					def test_get_oracle_moves(tsys, doc, entity_annots):
 | 
				
			||||||
    gold = GoldParse(doc, entities=entity_annots)
 | 
					    example = Example.from_dict(doc, {"entities": entity_annots})
 | 
				
			||||||
    tsys.preprocess_gold(gold)
 | 
					    act_classes = tsys.get_oracle_sequence(example)
 | 
				
			||||||
    act_classes = tsys.get_oracle_sequence(doc, gold)
 | 
					 | 
				
			||||||
    names = [tsys.get_class_name(act) for act in act_classes]
 | 
					    names = [tsys.get_class_name(act) for act in act_classes]
 | 
				
			||||||
    assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
 | 
					    assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
 | 
					def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
 | 
				
			||||||
    entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
 | 
					    entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
 | 
				
			||||||
    gold = GoldParse(doc, entities=entity_annots)
 | 
					    example = Example.from_dict(doc, {"entities": entity_annots})
 | 
				
			||||||
    for i, tag in enumerate(gold.ner):
 | 
					    ex_dict = example.to_dict()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for i, tag in enumerate(ex_dict["doc_annotation"]["entities"]):
 | 
				
			||||||
        if tag == "L-!GPE":
 | 
					        if tag == "L-!GPE":
 | 
				
			||||||
            gold.ner[i] = "-"
 | 
					            ex_dict["doc_annotation"]["entities"][i] = "-"
 | 
				
			||||||
    tsys.preprocess_gold(gold)
 | 
					    example = Example.from_dict(doc, ex_dict)
 | 
				
			||||||
    act_classes = tsys.get_oracle_sequence(doc, gold)
 | 
					
 | 
				
			||||||
 | 
					    act_classes = tsys.get_oracle_sequence(example)
 | 
				
			||||||
    names = [tsys.get_class_name(act) for act in act_classes]
 | 
					    names = [tsys.get_class_name(act) for act in act_classes]
 | 
				
			||||||
    assert names
 | 
					    assert names
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_get_oracle_moves_negative_entities2(tsys, vocab):
 | 
					def test_get_oracle_moves_negative_entities2(tsys, vocab):
 | 
				
			||||||
    doc = Doc(vocab, words=["A", "B", "C", "D"])
 | 
					    doc = Doc(vocab, words=["A", "B", "C", "D"])
 | 
				
			||||||
    gold = GoldParse(doc, entities=[])
 | 
					    entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
 | 
				
			||||||
    gold.ner = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
 | 
					    example = Example.from_dict(doc, {"entities": entity_annots})
 | 
				
			||||||
    tsys.preprocess_gold(gold)
 | 
					    act_classes = tsys.get_oracle_sequence(example)
 | 
				
			||||||
    act_classes = tsys.get_oracle_sequence(doc, gold)
 | 
					 | 
				
			||||||
    names = [tsys.get_class_name(act) for act in act_classes]
 | 
					    names = [tsys.get_class_name(act) for act in act_classes]
 | 
				
			||||||
    assert names
 | 
					    assert names
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.xfail(reason="Maybe outdated? Unsure")
 | 
				
			||||||
def test_get_oracle_moves_negative_O(tsys, vocab):
 | 
					def test_get_oracle_moves_negative_O(tsys, vocab):
 | 
				
			||||||
    doc = Doc(vocab, words=["A", "B", "C", "D"])
 | 
					    doc = Doc(vocab, words=["A", "B", "C", "D"])
 | 
				
			||||||
    gold = GoldParse(doc, entities=[])
 | 
					    entity_annots = ["O", "!O", "O", "!O"]
 | 
				
			||||||
    gold.ner = ["O", "!O", "O", "!O"]
 | 
					    example = Example.from_dict(doc, {"entities": entity_annots})
 | 
				
			||||||
    tsys.preprocess_gold(gold)
 | 
					    act_classes = tsys.get_oracle_sequence(example)
 | 
				
			||||||
    act_classes = tsys.get_oracle_sequence(doc, gold)
 | 
					 | 
				
			||||||
    names = [tsys.get_class_name(act) for act in act_classes]
 | 
					    names = [tsys.get_class_name(act) for act in act_classes]
 | 
				
			||||||
    assert names
 | 
					    assert names
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# We can't easily represent this on a Doc object. Not sure what the best solution
 | 
				
			||||||
 | 
					# would be, but I don't think it's an important use case?
 | 
				
			||||||
 | 
					@pytest.mark.xfail(reason="No longer supported")
 | 
				
			||||||
def test_oracle_moves_missing_B(en_vocab):
 | 
					def test_oracle_moves_missing_B(en_vocab):
 | 
				
			||||||
    words = ["B", "52", "Bomber"]
 | 
					    words = ["B", "52", "Bomber"]
 | 
				
			||||||
    biluo_tags = [None, None, "L-PRODUCT"]
 | 
					    biluo_tags = [None, None, "L-PRODUCT"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    doc = Doc(en_vocab, words=words)
 | 
					    doc = Doc(en_vocab, words=words)
 | 
				
			||||||
    gold = GoldParse(doc, words=words, entities=biluo_tags)
 | 
					    example = Example.from_dict(doc, {"words": words, "entities": biluo_tags})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    moves = BiluoPushDown(en_vocab.strings)
 | 
					    moves = BiluoPushDown(en_vocab.strings)
 | 
				
			||||||
    move_types = ("M", "B", "I", "L", "U", "O")
 | 
					    move_types = ("M", "B", "I", "L", "U", "O")
 | 
				
			||||||
| 
						 | 
					@ -111,16 +116,17 @@ def test_oracle_moves_missing_B(en_vocab):
 | 
				
			||||||
            moves.add_action(move_types.index("I"), label)
 | 
					            moves.add_action(move_types.index("I"), label)
 | 
				
			||||||
            moves.add_action(move_types.index("L"), label)
 | 
					            moves.add_action(move_types.index("L"), label)
 | 
				
			||||||
            moves.add_action(move_types.index("U"), label)
 | 
					            moves.add_action(move_types.index("U"), label)
 | 
				
			||||||
    moves.preprocess_gold(gold)
 | 
					    moves.get_oracle_sequence(example)
 | 
				
			||||||
    moves.get_oracle_sequence(doc, gold)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# We can't easily represent this on a Doc object. Not sure what the best solution
 | 
				
			||||||
 | 
					# would be, but I don't think it's an important use case?
 | 
				
			||||||
 | 
					@pytest.mark.xfail(reason="No longer supported")
 | 
				
			||||||
def test_oracle_moves_whitespace(en_vocab):
 | 
					def test_oracle_moves_whitespace(en_vocab):
 | 
				
			||||||
    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
 | 
					    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
 | 
				
			||||||
    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
 | 
					    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    doc = Doc(en_vocab, words=words)
 | 
					    doc = Doc(en_vocab, words=words)
 | 
				
			||||||
    gold = GoldParse(doc, words=words, entities=biluo_tags)
 | 
					    example = Example.from_dict(doc, {"entities": biluo_tags})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    moves = BiluoPushDown(en_vocab.strings)
 | 
					    moves = BiluoPushDown(en_vocab.strings)
 | 
				
			||||||
    move_types = ("M", "B", "I", "L", "U", "O")
 | 
					    move_types = ("M", "B", "I", "L", "U", "O")
 | 
				
			||||||
| 
						 | 
					@ -132,8 +138,7 @@ def test_oracle_moves_whitespace(en_vocab):
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            action, label = tag.split("-")
 | 
					            action, label = tag.split("-")
 | 
				
			||||||
            moves.add_action(move_types.index(action), label)
 | 
					            moves.add_action(move_types.index(action), label)
 | 
				
			||||||
    moves.preprocess_gold(gold)
 | 
					    moves.get_oracle_sequence(example)
 | 
				
			||||||
    moves.get_oracle_sequence(doc, gold)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_accept_blocked_token():
 | 
					def test_accept_blocked_token():
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,10 +1,11 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.pipeline.defaults import default_parser, default_tok2vec
 | 
					from spacy.pipeline.defaults import default_parser, default_tok2vec
 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
from spacy.syntax.arc_eager import ArcEager
 | 
					from spacy.syntax.arc_eager import ArcEager
 | 
				
			||||||
from spacy.syntax.nn_parser import Parser
 | 
					from spacy.syntax.nn_parser import Parser
 | 
				
			||||||
from spacy.tokens.doc import Doc
 | 
					from spacy.tokens.doc import Doc
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					 | 
				
			||||||
from thinc.api import Model
 | 
					from thinc.api import Model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -52,7 +53,7 @@ def doc(vocab):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture
 | 
					@pytest.fixture
 | 
				
			||||||
def gold(doc):
 | 
					def gold(doc):
 | 
				
			||||||
    return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"])
 | 
					    return {"heads": [1, 1, 1], "deps": ["L", "ROOT", "R"]}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_can_init_nn_parser(parser):
 | 
					def test_can_init_nn_parser(parser):
 | 
				
			||||||
| 
						 | 
					@ -77,7 +78,8 @@ def test_update_doc(parser, model, doc, gold):
 | 
				
			||||||
        weights -= 0.001 * gradient
 | 
					        weights -= 0.001 * gradient
 | 
				
			||||||
        return weights, gradient
 | 
					        return weights, gradient
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    parser.update((doc, gold), sgd=optimize)
 | 
					    example = Example.from_dict(doc, gold)
 | 
				
			||||||
 | 
					    parser.update([example], sgd=optimize)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.xfail
 | 
					@pytest.mark.xfail
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,107 +0,0 @@
 | 
				
			||||||
import pytest
 | 
					 | 
				
			||||||
import numpy
 | 
					 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					 | 
				
			||||||
from spacy.language import Language
 | 
					 | 
				
			||||||
from spacy.pipeline.defaults import default_parser
 | 
					 | 
				
			||||||
from spacy.pipeline import DependencyParser
 | 
					 | 
				
			||||||
from spacy.syntax.arc_eager import ArcEager
 | 
					 | 
				
			||||||
from spacy.tokens import Doc
 | 
					 | 
				
			||||||
from spacy.syntax._beam_utils import ParserBeam
 | 
					 | 
				
			||||||
from spacy.syntax.stateclass import StateClass
 | 
					 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def vocab():
 | 
					 | 
				
			||||||
    return Vocab()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def moves(vocab):
 | 
					 | 
				
			||||||
    aeager = ArcEager(vocab.strings, {})
 | 
					 | 
				
			||||||
    aeager.add_action(2, "nsubj")
 | 
					 | 
				
			||||||
    aeager.add_action(3, "dobj")
 | 
					 | 
				
			||||||
    aeager.add_action(2, "aux")
 | 
					 | 
				
			||||||
    return aeager
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def docs(vocab):
 | 
					 | 
				
			||||||
    return [Doc(vocab, words=["Rats", "bite", "things"])]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def states(docs):
 | 
					 | 
				
			||||||
    return [StateClass(doc) for doc in docs]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def tokvecs(docs, vector_size):
 | 
					 | 
				
			||||||
    output = []
 | 
					 | 
				
			||||||
    for doc in docs:
 | 
					 | 
				
			||||||
        vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
 | 
					 | 
				
			||||||
        output.append(numpy.asarray(vec))
 | 
					 | 
				
			||||||
    return output
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def golds(docs):
 | 
					 | 
				
			||||||
    return [GoldParse(doc) for doc in docs]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def batch_size(docs):
 | 
					 | 
				
			||||||
    return len(docs)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def beam_width():
 | 
					 | 
				
			||||||
    return 4
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def vector_size():
 | 
					 | 
				
			||||||
    return 6
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def beam(moves, states, golds, beam_width):
 | 
					 | 
				
			||||||
    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@pytest.fixture
 | 
					 | 
				
			||||||
def scores(moves, batch_size, beam_width):
 | 
					 | 
				
			||||||
    return [
 | 
					 | 
				
			||||||
        numpy.asarray(
 | 
					 | 
				
			||||||
            numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f"
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        for _ in range(batch_size)
 | 
					 | 
				
			||||||
    ]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_create_beam(beam):
 | 
					 | 
				
			||||||
    pass
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_beam_advance(beam, scores):
 | 
					 | 
				
			||||||
    beam.advance(scores)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_beam_advance_too_few_scores(beam, scores):
 | 
					 | 
				
			||||||
    with pytest.raises(IndexError):
 | 
					 | 
				
			||||||
        beam.advance(scores[:-1])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_beam_parse():
 | 
					 | 
				
			||||||
    nlp = Language()
 | 
					 | 
				
			||||||
    config = {
 | 
					 | 
				
			||||||
        "learn_tokens": False,
 | 
					 | 
				
			||||||
        "min_action_freq": 30,
 | 
					 | 
				
			||||||
        "beam_width": 1,
 | 
					 | 
				
			||||||
        "beam_update_prob": 1.0,
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
 | 
					 | 
				
			||||||
    nlp.parser.add_label("nsubj")
 | 
					 | 
				
			||||||
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
 | 
					 | 
				
			||||||
    doc = nlp.make_doc("Australia is a country")
 | 
					 | 
				
			||||||
    nlp.parser(doc, beam_width=2)
 | 
					 | 
				
			||||||
| 
						 | 
					@ -33,7 +33,7 @@ def test_parser_root(en_tokenizer):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.xfail
 | 
					@pytest.mark.xfail
 | 
				
			||||||
@pytest.mark.parametrize("text", ["Hello"])
 | 
					# @pytest.mark.parametrize("text", ["Hello"])
 | 
				
			||||||
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
 | 
					def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
 | 
				
			||||||
    tokens = en_tokenizer(text)
 | 
					    tokens = en_tokenizer(text)
 | 
				
			||||||
    doc = get_doc(
 | 
					    doc = get_doc(
 | 
				
			||||||
| 
						 | 
					@ -46,7 +46,8 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
 | 
				
			||||||
    assert doc[0].dep != 0
 | 
					    assert doc[0].dep != 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.xfail
 | 
					# We removed the step_through API a while ago. we should bring it back though
 | 
				
			||||||
 | 
					@pytest.mark.xfail(reason="Unsupported")
 | 
				
			||||||
def test_parser_initial(en_tokenizer, en_parser):
 | 
					def test_parser_initial(en_tokenizer, en_parser):
 | 
				
			||||||
    text = "I ate the pizza with anchovies."
 | 
					    text = "I ate the pizza with anchovies."
 | 
				
			||||||
    # heads = [1, 0, 1, -2, -3, -1, -5]
 | 
					    # heads = [1, 0, 1, -2, -3, -1, -5]
 | 
				
			||||||
| 
						 | 
					@ -90,8 +91,8 @@ def test_parser_merge_pp(en_tokenizer):
 | 
				
			||||||
    assert doc[2].text == "another phrase"
 | 
					    assert doc[2].text == "another phrase"
 | 
				
			||||||
    assert doc[3].text == "occurs"
 | 
					    assert doc[3].text == "occurs"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# We removed the step_through API a while ago. we should bring it back though
 | 
				
			||||||
@pytest.mark.xfail
 | 
					@pytest.mark.xfail(reason="Unsupported")
 | 
				
			||||||
def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
 | 
					def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
 | 
				
			||||||
    text = "a b c d e"
 | 
					    text = "a b c d e"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,9 +1,9 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from thinc.api import Adam
 | 
					from thinc.api import Adam
 | 
				
			||||||
from spacy.attrs import NORM
 | 
					from spacy.attrs import NORM
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from spacy.gold import Example
 | 
				
			||||||
from spacy.pipeline.defaults import default_parser
 | 
					from spacy.pipeline.defaults import default_parser
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
from spacy.pipeline import DependencyParser
 | 
					from spacy.pipeline import DependencyParser
 | 
				
			||||||
| 
						 | 
					@ -33,8 +33,10 @@ def parser(vocab):
 | 
				
			||||||
    for i in range(10):
 | 
					    for i in range(10):
 | 
				
			||||||
        losses = {}
 | 
					        losses = {}
 | 
				
			||||||
        doc = Doc(vocab, words=["a", "b", "c", "d"])
 | 
					        doc = Doc(vocab, words=["a", "b", "c", "d"])
 | 
				
			||||||
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
 | 
					        example = Example.from_dict(
 | 
				
			||||||
        parser.update((doc, gold), sgd=sgd, losses=losses)
 | 
					            doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        parser.update([example], sgd=sgd, losses=losses)
 | 
				
			||||||
    return parser
 | 
					    return parser
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -252,10 +252,18 @@ def test_preserving_links_ents_2(nlp):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# fmt: off
 | 
					# fmt: off
 | 
				
			||||||
TRAIN_DATA = [
 | 
					TRAIN_DATA = [
 | 
				
			||||||
    ("Russ Cochran captured his first major title with his son as caddie.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
 | 
					    ("Russ Cochran captured his first major title with his son as caddie.",
 | 
				
			||||||
    ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
 | 
					        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
 | 
				
			||||||
    ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
 | 
					         "entities": [(0, 12, "PERSON")]}),
 | 
				
			||||||
    ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
 | 
					    ("Russ Cochran his reprints include EC Comics.",
 | 
				
			||||||
 | 
					        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
 | 
				
			||||||
 | 
					         "entities": [(0, 12, "PERSON")]}),
 | 
				
			||||||
 | 
					    ("Russ Cochran has been publishing comic art.",
 | 
				
			||||||
 | 
					        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
 | 
				
			||||||
 | 
					         "entities": [(0, 12, "PERSON")]}),
 | 
				
			||||||
 | 
					    ("Russ Cochran was a member of University of Kentucky's golf team.",
 | 
				
			||||||
 | 
					        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
 | 
				
			||||||
 | 
					         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 | 
					GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 | 
				
			||||||
# fmt: on
 | 
					# fmt: on
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -53,7 +53,7 @@ def test_overfitting_IO():
 | 
				
			||||||
        "Feat=J|POS=ADJ",
 | 
					        "Feat=J|POS=ADJ",
 | 
				
			||||||
        "Feat=N|POS=NOUN",
 | 
					        "Feat=N|POS=NOUN",
 | 
				
			||||||
    ]
 | 
					    ]
 | 
				
			||||||
    assert gold_morphs == [t.morph_ for t in doc]
 | 
					    assert [t.morph_ for t in doc] == gold_morphs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Also test the results are still the same after IO
 | 
					    # Also test the results are still the same after IO
 | 
				
			||||||
    with make_tempdir() as tmp_dir:
 | 
					    with make_tempdir() as tmp_dir:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -26,7 +26,7 @@ def test_sentencizer_pipe():
 | 
				
			||||||
        sent_starts = [t.is_sent_start for t in doc]
 | 
					        sent_starts = [t.is_sent_start for t in doc]
 | 
				
			||||||
        assert sent_starts == [True, False, True, False, False, False, False]
 | 
					        assert sent_starts == [True, False, True, False, False, False, False]
 | 
				
			||||||
        assert len(list(doc.sents)) == 2
 | 
					        assert len(list(doc.sents)) == 2
 | 
				
			||||||
    for ex in nlp.pipe(texts, as_example=True):
 | 
					    for ex in nlp.pipe(texts):
 | 
				
			||||||
        doc = ex.doc
 | 
					        doc = ex.doc
 | 
				
			||||||
        assert doc.is_sentenced
 | 
					        assert doc.is_sentenced
 | 
				
			||||||
        sent_starts = [t.is_sent_start for t in doc]
 | 
					        sent_starts = [t.is_sent_start for t in doc]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,11 +7,11 @@ from spacy.lang.en import English
 | 
				
			||||||
from spacy.language import Language
 | 
					from spacy.language import Language
 | 
				
			||||||
from spacy.pipeline import TextCategorizer
 | 
					from spacy.pipeline import TextCategorizer
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					 | 
				
			||||||
from spacy.util import fix_random_seed
 | 
					from spacy.util import fix_random_seed
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..util import make_tempdir
 | 
					from ..util import make_tempdir
 | 
				
			||||||
from spacy.pipeline.defaults import default_tok2vec
 | 
					from spacy.pipeline.defaults import default_tok2vec
 | 
				
			||||||
 | 
					from ...gold import Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TRAIN_DATA = [
 | 
					TRAIN_DATA = [
 | 
				
			||||||
    ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
 | 
					    ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
 | 
				
			||||||
| 
						 | 
					@ -51,21 +51,20 @@ def test_textcat_learns_multilabel():
 | 
				
			||||||
            cats = {letter: float(w2 == letter) for letter in letters}
 | 
					            cats = {letter: float(w2 == letter) for letter in letters}
 | 
				
			||||||
            docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
 | 
					            docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
 | 
				
			||||||
    random.shuffle(docs)
 | 
					    random.shuffle(docs)
 | 
				
			||||||
    model = TextCategorizer(nlp.vocab, width=8)
 | 
					    textcat = TextCategorizer(nlp.vocab, width=8)
 | 
				
			||||||
    for letter in letters:
 | 
					    for letter in letters:
 | 
				
			||||||
        model.add_label(letter)
 | 
					        textcat.add_label(letter)
 | 
				
			||||||
    optimizer = model.begin_training()
 | 
					    optimizer = textcat.begin_training()
 | 
				
			||||||
    for i in range(30):
 | 
					    for i in range(30):
 | 
				
			||||||
        losses = {}
 | 
					        losses = {}
 | 
				
			||||||
        Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
 | 
					        examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
 | 
				
			||||||
        Xs = [doc for doc, cats in docs]
 | 
					        textcat.update(examples, sgd=optimizer, losses=losses)
 | 
				
			||||||
        model.update(Xs, Ys, sgd=optimizer, losses=losses)
 | 
					 | 
				
			||||||
        random.shuffle(docs)
 | 
					        random.shuffle(docs)
 | 
				
			||||||
    for w1 in letters:
 | 
					    for w1 in letters:
 | 
				
			||||||
        for w2 in letters:
 | 
					        for w2 in letters:
 | 
				
			||||||
            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
 | 
					            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
 | 
				
			||||||
            truth = {letter: w2 == letter for letter in letters}
 | 
					            truth = {letter: w2 == letter for letter in letters}
 | 
				
			||||||
            model(doc)
 | 
					            textcat(doc)
 | 
				
			||||||
            for cat, score in doc.cats.items():
 | 
					            for cat, score in doc.cats.items():
 | 
				
			||||||
                if not truth[cat]:
 | 
					                if not truth[cat]:
 | 
				
			||||||
                    assert score < 0.5
 | 
					                    assert score < 0.5
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -277,11 +277,18 @@ def test_issue1967(label):
 | 
				
			||||||
        "beam_update_prob": 1.0,
 | 
					        "beam_update_prob": 1.0,
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    ner = EntityRecognizer(Vocab(), default_ner(), **config)
 | 
					    ner = EntityRecognizer(Vocab(), default_ner(), **config)
 | 
				
			||||||
    example = Example(doc=None)
 | 
					    example = Example.from_dict(
 | 
				
			||||||
    example.set_token_annotation(
 | 
					        Doc(ner.vocab, words=["word"]),
 | 
				
			||||||
        ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
 | 
					        {
 | 
				
			||||||
 | 
					            "ids": [0],
 | 
				
			||||||
 | 
					            "words": ["word"],
 | 
				
			||||||
 | 
					            "tags": ["tag"],
 | 
				
			||||||
 | 
					            "heads": [0],
 | 
				
			||||||
 | 
					            "deps": ["dep"],
 | 
				
			||||||
 | 
					            "entities": [label],
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    ner.moves.get_actions(gold_parses=[example])
 | 
					    assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_issue1971(en_vocab):
 | 
					def test_issue1971(en_vocab):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,7 @@
 | 
				
			||||||
from collections import defaultdict
 | 
					from collections import defaultdict
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy.pipeline.defaults import default_ner
 | 
					from spacy.pipeline.defaults import default_ner
 | 
				
			||||||
from spacy.pipeline import EntityRecognizer
 | 
					from spacy.pipeline import EntityRecognizer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,6 +9,8 @@ from spacy.lang.en import English
 | 
				
			||||||
from spacy.tokens import Span
 | 
					from spacy.tokens import Span
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# skipped after removing Beam stuff during the Example/GoldParse refactor
 | 
				
			||||||
 | 
					@pytest.mark.skip
 | 
				
			||||||
def test_issue4313():
 | 
					def test_issue4313():
 | 
				
			||||||
    """ This should not crash or exit with some strange error code """
 | 
					    """ This should not crash or exit with some strange error code """
 | 
				
			||||||
    beam_width = 16
 | 
					    beam_width = 16
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,24 +1,31 @@
 | 
				
			||||||
import srsly
 | 
					from spacy.gold import Corpus
 | 
				
			||||||
from spacy.gold import GoldCorpus
 | 
					 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..util import make_tempdir
 | 
					from ..util import make_tempdir
 | 
				
			||||||
 | 
					from ...gold.converters import json2docs
 | 
				
			||||||
 | 
					from ...tokens import DocBin
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_issue4402():
 | 
					def test_issue4402():
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
        json_path = tmpdir / "test4402.json"
 | 
					        output_file = tmpdir / "test4402.spacy"
 | 
				
			||||||
        srsly.write_json(json_path, json_data)
 | 
					        docs = json2docs([json_data])
 | 
				
			||||||
 | 
					        data = DocBin(docs=docs, attrs =["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
 | 
				
			||||||
 | 
					        with output_file.open("wb") as file_:
 | 
				
			||||||
 | 
					            file_.write(data)
 | 
				
			||||||
 | 
					        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        corpus = GoldCorpus(str(json_path), str(json_path))
 | 
					        train_data = list(corpus.train_dataset(nlp))
 | 
				
			||||||
 | 
					        assert len(train_data) == 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
 | 
					        split_train_data = []
 | 
				
			||||||
        # assert that the data got split into 4 sentences
 | 
					        for eg in train_data:
 | 
				
			||||||
        assert len(train_data) == 4
 | 
					            split_train_data.extend(eg.split_sents())
 | 
				
			||||||
 | 
					        assert len(split_train_data) == 4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
json_data = [
 | 
					json_data =\
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
        "id": 0,
 | 
					        "id": 0,
 | 
				
			||||||
        "paragraphs": [
 | 
					        "paragraphs": [
 | 
				
			||||||
| 
						 | 
					@ -89,4 +96,3 @@ json_data = [
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
        ],
 | 
					        ],
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
]
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,6 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					
 | 
				
			||||||
 | 
					from spacy.gold import Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
| 
						 | 
					@ -7,4 +8,4 @@ from spacy.gold import GoldParse
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
def test_gold_misaligned(en_tokenizer, text, words):
 | 
					def test_gold_misaligned(en_tokenizer, text, words):
 | 
				
			||||||
    doc = en_tokenizer(text)
 | 
					    doc = en_tokenizer(text)
 | 
				
			||||||
    GoldParse(doc, words=words)
 | 
					    Example.from_dict(doc, {"words": words})
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,7 @@
 | 
				
			||||||
from spacy.cli.converters.conllu2json import conllu2json
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# TODO
 | 
				
			||||||
 | 
					# from spacy.gold.converters.conllu2docs import conllu2docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
input_data = """
 | 
					input_data = """
 | 
				
			||||||
1	[	_	PUNCT	-LRB-	_	_	punct	_	_
 | 
					1	[	_	PUNCT	-LRB-	_	_	punct	_	_
 | 
				
			||||||
| 
						 | 
					@ -22,10 +25,11 @@ input_data = """
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.xfail
 | 
				
			||||||
def test_issue4665():
 | 
					def test_issue4665():
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    conllu2json should not raise an exception if the HEAD column contains an
 | 
					    conllu2json should not raise an exception if the HEAD column contains an
 | 
				
			||||||
    underscore
 | 
					    underscore
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    pass
 | 
				
			||||||
    conllu2json(input_data)
 | 
					    # conllu2json(input_data)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,9 +1,14 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from spacy.gold import docs_to_json
 | 
				
			||||||
 | 
					from spacy.gold.converters import iob2docs, conll_ner2docs
 | 
				
			||||||
 | 
					from spacy.gold.converters.conllu2json import conllu2json
 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
from spacy.cli.converters import conllu2json, iob2json, conll_ner2json
 | 
					 | 
				
			||||||
from spacy.cli.pretrain import make_docs
 | 
					from spacy.cli.pretrain import make_docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# TODO
 | 
				
			||||||
 | 
					# from spacy.gold.converters import conllu2docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_cli_converters_conllu2json():
 | 
					def test_cli_converters_conllu2json():
 | 
				
			||||||
    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
 | 
					    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
 | 
				
			||||||
| 
						 | 
					@ -109,7 +114,7 @@ def test_cli_converters_conllu2json_subtokens():
 | 
				
			||||||
    assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 | 
					    assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_cli_converters_iob2json():
 | 
					def test_cli_converters_iob2json(en_vocab):
 | 
				
			||||||
    lines = [
 | 
					    lines = [
 | 
				
			||||||
        "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
 | 
					        "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
 | 
				
			||||||
        "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
 | 
					        "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
 | 
				
			||||||
| 
						 | 
					@ -117,19 +122,21 @@ def test_cli_converters_iob2json():
 | 
				
			||||||
        "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
 | 
					        "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
 | 
				
			||||||
    ]
 | 
					    ]
 | 
				
			||||||
    input_data = "\n".join(lines)
 | 
					    input_data = "\n".join(lines)
 | 
				
			||||||
    converted = iob2json(input_data, n_sents=10)
 | 
					    converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
 | 
				
			||||||
    assert len(converted) == 1
 | 
					    assert len(converted_docs) == 1
 | 
				
			||||||
    assert converted[0]["id"] == 0
 | 
					    converted = docs_to_json(converted_docs)
 | 
				
			||||||
    assert len(converted[0]["paragraphs"]) == 1
 | 
					    assert converted["id"] == 0
 | 
				
			||||||
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 4
 | 
					    assert len(converted["paragraphs"]) == 1
 | 
				
			||||||
 | 
					    assert len(converted["paragraphs"][0]["sentences"]) == 4
 | 
				
			||||||
    for i in range(0, 4):
 | 
					    for i in range(0, 4):
 | 
				
			||||||
        sent = converted[0]["paragraphs"][0]["sentences"][i]
 | 
					        sent = converted["paragraphs"][0]["sentences"][i]
 | 
				
			||||||
        assert len(sent["tokens"]) == 8
 | 
					        assert len(sent["tokens"]) == 8
 | 
				
			||||||
        tokens = sent["tokens"]
 | 
					        tokens = sent["tokens"]
 | 
				
			||||||
        # fmt: off
 | 
					        # fmt: off
 | 
				
			||||||
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
 | 
					        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
 | 
				
			||||||
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
 | 
					    assert len(converted_docs[0].ents) == 8
 | 
				
			||||||
        # fmt: on
 | 
					    for ent in converted_docs[0].ents:
 | 
				
			||||||
 | 
					        assert(ent.text in ["New York City", "London"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_cli_converters_conll_ner2json():
 | 
					def test_cli_converters_conll_ner2json():
 | 
				
			||||||
| 
						 | 
					@ -182,19 +189,22 @@ def test_cli_converters_conll_ner2json():
 | 
				
			||||||
        ".\t.\t_\tO",
 | 
					        ".\t.\t_\tO",
 | 
				
			||||||
    ]
 | 
					    ]
 | 
				
			||||||
    input_data = "\n".join(lines)
 | 
					    input_data = "\n".join(lines)
 | 
				
			||||||
    converted = conll_ner2json(input_data, n_sents=10)
 | 
					    converted_docs = conll_ner2docs(input_data, n_sents=10)
 | 
				
			||||||
    assert len(converted) == 1
 | 
					    assert len(converted_docs) == 1
 | 
				
			||||||
    assert converted[0]["id"] == 0
 | 
					    converted = docs_to_json(converted_docs)
 | 
				
			||||||
    assert len(converted[0]["paragraphs"]) == 1
 | 
					    assert converted["id"] == 0
 | 
				
			||||||
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
 | 
					    assert len(converted["paragraphs"]) == 1
 | 
				
			||||||
 | 
					    assert len(converted["paragraphs"][0]["sentences"]) == 5
 | 
				
			||||||
    for i in range(0, 5):
 | 
					    for i in range(0, 5):
 | 
				
			||||||
        sent = converted[0]["paragraphs"][0]["sentences"][i]
 | 
					        sent = converted["paragraphs"][0]["sentences"][i]
 | 
				
			||||||
        assert len(sent["tokens"]) == 8
 | 
					        assert len(sent["tokens"]) == 8
 | 
				
			||||||
        tokens = sent["tokens"]
 | 
					        tokens = sent["tokens"]
 | 
				
			||||||
        # fmt: off
 | 
					        # fmt: off
 | 
				
			||||||
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
 | 
					        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
 | 
				
			||||||
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
 | 
					 | 
				
			||||||
        # fmt: on
 | 
					        # fmt: on
 | 
				
			||||||
 | 
					    assert len(converted_docs[0].ents) == 10
 | 
				
			||||||
 | 
					    for ent in converted_docs[0].ents:
 | 
				
			||||||
 | 
					        assert (ent.text in ["New York City", "London"])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_pretrain_make_docs():
 | 
					def test_pretrain_make_docs():
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,15 +1,18 @@
 | 
				
			||||||
from spacy.errors import AlignmentError
 | 
					from spacy.errors import AlignmentError
 | 
				
			||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 | 
					from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 | 
				
			||||||
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align
 | 
					from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
 | 
				
			||||||
from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation
 | 
					from spacy.gold import Corpus, docs_to_json
 | 
				
			||||||
 | 
					from spacy.gold.example import Example
 | 
				
			||||||
 | 
					from spacy.gold.converters import json2docs
 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
from spacy.syntax.nonproj import is_nonproj_tree
 | 
					from spacy.syntax.nonproj import is_nonproj_tree
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc, DocBin
 | 
				
			||||||
from spacy.util import get_words_and_spaces, compounding, minibatch
 | 
					from spacy.util import get_words_and_spaces, compounding, minibatch
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .util import make_tempdir
 | 
					from .util import make_tempdir
 | 
				
			||||||
 | 
					from ..gold.augment import make_orth_variants_example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture
 | 
					@pytest.fixture
 | 
				
			||||||
| 
						 | 
					@ -89,11 +92,18 @@ def merged_dict():
 | 
				
			||||||
    return {
 | 
					    return {
 | 
				
			||||||
        "ids": [1, 2, 3, 4, 5, 6, 7],
 | 
					        "ids": [1, 2, 3, 4, 5, 6, 7],
 | 
				
			||||||
        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
 | 
					        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
 | 
				
			||||||
 | 
					        "spaces": [True, True, True, True, True, True, False],
 | 
				
			||||||
        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
 | 
					        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
 | 
				
			||||||
        "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
 | 
					        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.fixture
 | 
				
			||||||
 | 
					def vocab():
 | 
				
			||||||
 | 
					    nlp = English()
 | 
				
			||||||
 | 
					    return nlp.vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_gold_biluo_U(en_vocab):
 | 
					def test_gold_biluo_U(en_vocab):
 | 
				
			||||||
    words = ["I", "flew", "to", "London", "."]
 | 
					    words = ["I", "flew", "to", "London", "."]
 | 
				
			||||||
    spaces = [True, True, True, False, True]
 | 
					    spaces = [True, True, True, False, True]
 | 
				
			||||||
| 
						 | 
					@ -143,38 +153,181 @@ def test_gold_biluo_misalign(en_vocab):
 | 
				
			||||||
    assert tags == ["O", "O", "O", "-", "-", "-"]
 | 
					    assert tags == ["O", "O", "O", "-", "-", "-"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_example_from_dict_no_ner(en_vocab):
 | 
				
			||||||
 | 
					    words = ["a", "b", "c", "d"]
 | 
				
			||||||
 | 
					    spaces = [True, True, False, True]
 | 
				
			||||||
 | 
					    predicted = Doc(en_vocab, words=words, spaces=spaces)
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, {"words": words})
 | 
				
			||||||
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
 | 
					    assert ner_tags == [None, None, None, None]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_example_from_dict_some_ner(en_vocab):
 | 
				
			||||||
 | 
					    words = ["a", "b", "c", "d"]
 | 
				
			||||||
 | 
					    spaces = [True, True, False, True]
 | 
				
			||||||
 | 
					    predicted = Doc(en_vocab, words=words, spaces=spaces)
 | 
				
			||||||
 | 
					    example = Example.from_dict(
 | 
				
			||||||
 | 
					        predicted,
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": words,
 | 
				
			||||||
 | 
					            "entities": ["U-LOC", None, None, None]
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
 | 
					    assert ner_tags == ["U-LOC", None, None, None]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_json2docs_no_ner(en_vocab):
 | 
				
			||||||
 | 
					    data = [{
 | 
				
			||||||
 | 
					        "id":1,
 | 
				
			||||||
 | 
					            "paragraphs":[
 | 
				
			||||||
 | 
					              {
 | 
				
			||||||
 | 
					                "sentences":[
 | 
				
			||||||
 | 
					                  {
 | 
				
			||||||
 | 
					                    "tokens":[
 | 
				
			||||||
 | 
					                      {
 | 
				
			||||||
 | 
					                        "dep":"nn",
 | 
				
			||||||
 | 
					                        "head":1,
 | 
				
			||||||
 | 
					                        "tag":"NNP",
 | 
				
			||||||
 | 
					                        "orth":"Ms."
 | 
				
			||||||
 | 
					                      },
 | 
				
			||||||
 | 
					                      {
 | 
				
			||||||
 | 
					                        "dep":"nsubj",
 | 
				
			||||||
 | 
					                        "head":1,
 | 
				
			||||||
 | 
					                        "tag":"NNP",
 | 
				
			||||||
 | 
					                        "orth":"Haag"
 | 
				
			||||||
 | 
					                      },
 | 
				
			||||||
 | 
					                      {
 | 
				
			||||||
 | 
					                        "dep":"ROOT",
 | 
				
			||||||
 | 
					                        "head":0,
 | 
				
			||||||
 | 
					                        "tag":"VBZ",
 | 
				
			||||||
 | 
					                        "orth":"plays"
 | 
				
			||||||
 | 
					                      },
 | 
				
			||||||
 | 
					                      {
 | 
				
			||||||
 | 
					                        "dep":"dobj",
 | 
				
			||||||
 | 
					                        "head":-1,
 | 
				
			||||||
 | 
					                        "tag":"NNP",
 | 
				
			||||||
 | 
					                        "orth":"Elianti"
 | 
				
			||||||
 | 
					                      },
 | 
				
			||||||
 | 
					                      {
 | 
				
			||||||
 | 
					                        "dep":"punct",
 | 
				
			||||||
 | 
					                        "head":-2,
 | 
				
			||||||
 | 
					                        "tag":".",
 | 
				
			||||||
 | 
					                        "orth":"."
 | 
				
			||||||
 | 
					                      }
 | 
				
			||||||
 | 
					                    ]
 | 
				
			||||||
 | 
					                  }
 | 
				
			||||||
 | 
					                ]
 | 
				
			||||||
 | 
					              }
 | 
				
			||||||
 | 
					            ]
 | 
				
			||||||
 | 
					          }]
 | 
				
			||||||
 | 
					    docs = json2docs(data)
 | 
				
			||||||
 | 
					    assert len(docs) == 1
 | 
				
			||||||
 | 
					    for doc in docs:
 | 
				
			||||||
 | 
					        assert not doc.is_nered
 | 
				
			||||||
 | 
					    for token in doc:
 | 
				
			||||||
 | 
					        assert token.ent_iob == 0
 | 
				
			||||||
 | 
					    eg = Example(
 | 
				
			||||||
 | 
					        Doc(
 | 
				
			||||||
 | 
					            doc.vocab,
 | 
				
			||||||
 | 
					            words=[w.text for w in doc],
 | 
				
			||||||
 | 
					            spaces=[bool(w.whitespace_) for w in doc]
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        doc
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    ner_tags = eg.get_aligned_ner()
 | 
				
			||||||
 | 
					    assert ner_tags == [None, None, None, None, None]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					         
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_split_sentences(en_vocab):
 | 
				
			||||||
 | 
					    words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
 | 
				
			||||||
 | 
					    doc = Doc(en_vocab, words=words)
 | 
				
			||||||
 | 
					    gold_words = [
 | 
				
			||||||
 | 
					        "I",
 | 
				
			||||||
 | 
					        "flew",
 | 
				
			||||||
 | 
					        "to",
 | 
				
			||||||
 | 
					        "San",
 | 
				
			||||||
 | 
					        "Francisco",
 | 
				
			||||||
 | 
					        "Valley",
 | 
				
			||||||
 | 
					        "had",
 | 
				
			||||||
 | 
					        "loads",
 | 
				
			||||||
 | 
					        "of",
 | 
				
			||||||
 | 
					        "fun",
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					    sent_starts = [True, False, False, False, False, False, True, False, False, False]
 | 
				
			||||||
 | 
					    example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
 | 
				
			||||||
 | 
					    assert example.text == "I flew to San Francisco Valley had loads of fun "
 | 
				
			||||||
 | 
					    split_examples = example.split_sents()
 | 
				
			||||||
 | 
					    assert len(split_examples) == 2
 | 
				
			||||||
 | 
					    assert split_examples[0].text == "I flew to San Francisco Valley "
 | 
				
			||||||
 | 
					    assert split_examples[1].text == "had loads of fun "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"]
 | 
				
			||||||
 | 
					    doc = Doc(en_vocab, words=words)
 | 
				
			||||||
 | 
					    gold_words = [
 | 
				
			||||||
 | 
					        "I",
 | 
				
			||||||
 | 
					        "flew",
 | 
				
			||||||
 | 
					        "to",
 | 
				
			||||||
 | 
					        "San Francisco",
 | 
				
			||||||
 | 
					        "Valley",
 | 
				
			||||||
 | 
					        "had",
 | 
				
			||||||
 | 
					        "loads of",
 | 
				
			||||||
 | 
					        "fun",
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					    sent_starts = [True, False, False, False, False, True, False, False]
 | 
				
			||||||
 | 
					    example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
 | 
				
			||||||
 | 
					    assert example.text == "I flew to San Francisco Valley had loads of fun "
 | 
				
			||||||
 | 
					    split_examples = example.split_sents()
 | 
				
			||||||
 | 
					    assert len(split_examples) == 2
 | 
				
			||||||
 | 
					    assert split_examples[0].text == "I flew to San Francisco Valley "
 | 
				
			||||||
 | 
					    assert split_examples[1].text == "had loads of fun "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
 | 
					def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
 | 
				
			||||||
    # one-to-many
 | 
					    # one-to-many
 | 
				
			||||||
    words = ["I", "flew to", "San Francisco Valley", "."]
 | 
					    words = ["I", "flew to", "San Francisco Valley", "."]
 | 
				
			||||||
    spaces = [True, True, False, False]
 | 
					    spaces = [True, True, False, False]
 | 
				
			||||||
    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
					    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
				
			||||||
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
 | 
					    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
 | 
				
			||||||
    gp = GoldParse(
 | 
					    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
 | 
				
			||||||
        doc,
 | 
					    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
 | 
				
			||||||
        words=["I", "flew", "to", "San", "Francisco", "Valley", "."],
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
        entities=entities,
 | 
					    assert ner_tags == ["O", None, "U-LOC", "O"]
 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    assert gp.ner == ["O", "O", "U-LOC", "O"]
 | 
					 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    # many-to-one
 | 
					    # many-to-one
 | 
				
			||||||
    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
 | 
					    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
 | 
				
			||||||
    spaces = [True, True, True, True, True, False, False]
 | 
					    spaces = [True, True, True, True, True, False, False]
 | 
				
			||||||
    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
					    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
				
			||||||
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
 | 
					    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
 | 
				
			||||||
    gp = GoldParse(
 | 
					    gold_words = ["I", "flew to", "San Francisco Valley", "."]
 | 
				
			||||||
        doc, words=["I", "flew to", "San Francisco Valley", "."], entities=entities
 | 
					    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
 | 
				
			||||||
    )
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
    assert gp.ner == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 | 
					    assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # misaligned
 | 
					    # misaligned
 | 
				
			||||||
    words = ["I flew", "to", "San Francisco", "Valley", "."]
 | 
					    words = ["I flew", "to", "San Francisco", "Valley", "."]
 | 
				
			||||||
    spaces = [True, True, True, False, False]
 | 
					    spaces = [True, True, True, False, False]
 | 
				
			||||||
    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
					    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
				
			||||||
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
 | 
					    offset_start = len("I flew to ")
 | 
				
			||||||
    gp = GoldParse(
 | 
					    offset_end = len("I flew to San Francisco Valley")
 | 
				
			||||||
        doc, words=["I", "flew to", "San", "Francisco Valley", "."], entities=entities,
 | 
					    entities = [(offset_start, offset_end, "LOC")]
 | 
				
			||||||
 | 
					    links = {(offset_start, offset_end): {"Q816843": 1.0}}
 | 
				
			||||||
 | 
					    gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
 | 
				
			||||||
 | 
					    example = Example.from_dict(
 | 
				
			||||||
 | 
					        doc, {"words": gold_words, "entities": entities, "links": links}
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    assert gp.ner == ["O", "O", "B-LOC", "L-LOC", "O"]
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
 | 
					    assert ner_tags == [None, "O", "B-LOC", "L-LOC", "O"]
 | 
				
			||||||
 | 
					    #assert example.get_aligned("ENT_KB_ID", as_string=True) == [
 | 
				
			||||||
 | 
					    #    "",
 | 
				
			||||||
 | 
					    #    "",
 | 
				
			||||||
 | 
					    #    "Q816843",
 | 
				
			||||||
 | 
					    #    "Q816843",
 | 
				
			||||||
 | 
					    #    "",
 | 
				
			||||||
 | 
					    #]
 | 
				
			||||||
 | 
					    #assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {
 | 
				
			||||||
 | 
					    #    "Q816843": 1.0
 | 
				
			||||||
 | 
					    #}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # additional whitespace tokens in GoldParse words
 | 
					    # additional whitespace tokens in GoldParse words
 | 
				
			||||||
    words, spaces = get_words_and_spaces(
 | 
					    words, spaces = get_words_and_spaces(
 | 
				
			||||||
| 
						 | 
					@ -183,33 +336,34 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
					    doc = Doc(en_vocab, words=words, spaces=spaces)
 | 
				
			||||||
    entities = [(len("I flew  to "), len("I flew  to San Francisco Valley"), "LOC")]
 | 
					    entities = [(len("I flew  to "), len("I flew  to San Francisco Valley"), "LOC")]
 | 
				
			||||||
    gp = GoldParse(
 | 
					    gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
 | 
				
			||||||
        doc,
 | 
					    gold_spaces = [True, True, False, True, False, False]
 | 
				
			||||||
        words=["I", "flew", " ", "to", "San Francisco Valley", "."],
 | 
					    example = Example.from_dict(
 | 
				
			||||||
        entities=entities,
 | 
					        doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    assert gp.ner == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
 | 
					    assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # from issue #4791
 | 
					    # from issue #4791
 | 
				
			||||||
    data = (
 | 
					    doc = en_tokenizer("I'll return the ₹54 amount")
 | 
				
			||||||
        "I'll return the ₹54 amount",
 | 
					    gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
 | 
				
			||||||
        {
 | 
					    gold_spaces = [False, True, True, True, False, True, False]
 | 
				
			||||||
            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
 | 
					    entities = [(16, 19, "MONEY")]
 | 
				
			||||||
            "entities": [(16, 19, "MONEY")],
 | 
					    example = Example.from_dict(
 | 
				
			||||||
        },
 | 
					        doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    gp = GoldParse(en_tokenizer(data[0]), **data[1])
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
    assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"]
 | 
					    assert ner_tags == ["O", "O", "O", "O", "U-MONEY", "O"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data = (
 | 
					    doc = en_tokenizer("I'll return the $54 amount")
 | 
				
			||||||
        "I'll return the $54 amount",
 | 
					    gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"]
 | 
				
			||||||
        {
 | 
					    gold_spaces = [False, True, True, True, False, True, False]
 | 
				
			||||||
            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
 | 
					    entities = [(16, 19, "MONEY")]
 | 
				
			||||||
            "entities": [(16, 19, "MONEY")],
 | 
					    example = Example.from_dict(
 | 
				
			||||||
        },
 | 
					        doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities}
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    gp = GoldParse(en_tokenizer(data[0]), **data[1])
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
    assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
 | 
					    assert ner_tags == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 | 
					def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 | 
				
			||||||
| 
						 | 
					@ -220,6 +374,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 | 
				
			||||||
    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
 | 
					    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
 | 
				
			||||||
    assert biluo_tags_converted == biluo_tags
 | 
					    assert biluo_tags_converted == biluo_tags
 | 
				
			||||||
    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
 | 
					    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
 | 
				
			||||||
 | 
					    offsets_converted = [ent for ent in offsets if ent[2]]
 | 
				
			||||||
    assert offsets_converted == offsets
 | 
					    assert offsets_converted == offsets
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -227,6 +382,7 @@ def test_biluo_spans(en_tokenizer):
 | 
				
			||||||
    doc = en_tokenizer("I flew to Silicon Valley via London.")
 | 
					    doc = en_tokenizer("I flew to Silicon Valley via London.")
 | 
				
			||||||
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
 | 
					    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
 | 
				
			||||||
    spans = spans_from_biluo_tags(doc, biluo_tags)
 | 
					    spans = spans_from_biluo_tags(doc, biluo_tags)
 | 
				
			||||||
 | 
					    spans = [span for span in spans if span.label_]
 | 
				
			||||||
    assert len(spans) == 2
 | 
					    assert len(spans) == 2
 | 
				
			||||||
    assert spans[0].text == "Silicon Valley"
 | 
					    assert spans[0].text == "Silicon Valley"
 | 
				
			||||||
    assert spans[0].label_ == "LOC"
 | 
					    assert spans[0].label_ == "LOC"
 | 
				
			||||||
| 
						 | 
					@ -237,7 +393,8 @@ def test_biluo_spans(en_tokenizer):
 | 
				
			||||||
def test_gold_ner_missing_tags(en_tokenizer):
 | 
					def test_gold_ner_missing_tags(en_tokenizer):
 | 
				
			||||||
    doc = en_tokenizer("I flew to Silicon Valley via London.")
 | 
					    doc = en_tokenizer("I flew to Silicon Valley via London.")
 | 
				
			||||||
    biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
 | 
					    biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
 | 
				
			||||||
    gold = GoldParse(doc, entities=biluo_tags)  # noqa: F841
 | 
					    example = Example.from_dict(doc, {"entities": biluo_tags})
 | 
				
			||||||
 | 
					    assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_iob_to_biluo():
 | 
					def test_iob_to_biluo():
 | 
				
			||||||
| 
						 | 
					@ -250,159 +407,98 @@ def test_iob_to_biluo():
 | 
				
			||||||
        iob_to_biluo(bad_iob)
 | 
					        iob_to_biluo(bad_iob)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_roundtrip_docs_to_json(doc):
 | 
					def test_roundtrip_docs_to_docbin(doc):
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    text = doc.text
 | 
					    text = doc.text
 | 
				
			||||||
 | 
					    idx = [t.idx for t in doc]
 | 
				
			||||||
    tags = [t.tag_ for t in doc]
 | 
					    tags = [t.tag_ for t in doc]
 | 
				
			||||||
    pos = [t.pos_ for t in doc]
 | 
					    pos = [t.pos_ for t in doc]
 | 
				
			||||||
    morphs = [t.morph_ for t in doc]
 | 
					    morphs = [t.morph_ for t in doc]
 | 
				
			||||||
    lemmas = [t.lemma_ for t in doc]
 | 
					    lemmas = [t.lemma_ for t in doc]
 | 
				
			||||||
    deps = [t.dep_ for t in doc]
 | 
					    deps = [t.dep_ for t in doc]
 | 
				
			||||||
    heads = [t.head.i for t in doc]
 | 
					    heads = [t.head.i for t in doc]
 | 
				
			||||||
    biluo_tags = iob_to_biluo(
 | 
					 | 
				
			||||||
        [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    cats = doc.cats
 | 
					    cats = doc.cats
 | 
				
			||||||
 | 
					    ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # roundtrip to JSON
 | 
					    # roundtrip to DocBin
 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
        json_file = tmpdir / "roundtrip.json"
 | 
					        json_file = tmpdir / "roundtrip.json"
 | 
				
			||||||
        srsly.write_json(json_file, [docs_to_json(doc)])
 | 
					        srsly.write_json(json_file, [docs_to_json(doc)])
 | 
				
			||||||
        goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
 | 
					        goldcorpus = Corpus(str(json_file), str(json_file))
 | 
				
			||||||
 | 
					        output_file = tmpdir / "roundtrip.spacy"
 | 
				
			||||||
    reloaded_example = next(goldcorpus.dev_dataset(nlp))
 | 
					        data = DocBin(docs=[doc]).to_bytes()
 | 
				
			||||||
    goldparse = reloaded_example.gold
 | 
					        with output_file.open("wb") as file_:
 | 
				
			||||||
 | 
					            file_.write(data)
 | 
				
			||||||
    assert len(doc) == goldcorpus.count_train()
 | 
					        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
 | 
				
			||||||
    assert text == reloaded_example.text
 | 
					        reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
 | 
				
			||||||
    assert tags == goldparse.tags
 | 
					        assert len(doc) == goldcorpus.count_train(nlp)
 | 
				
			||||||
    assert pos == goldparse.pos
 | 
					    assert text == reloaded_example.reference.text
 | 
				
			||||||
    assert morphs == goldparse.morphs
 | 
					    assert idx == [t.idx for t in reloaded_example.reference]
 | 
				
			||||||
    assert lemmas == goldparse.lemmas
 | 
					    assert tags == [t.tag_ for t in reloaded_example.reference]
 | 
				
			||||||
    assert deps == goldparse.labels
 | 
					    assert pos == [t.pos_ for t in reloaded_example.reference]
 | 
				
			||||||
    assert heads == goldparse.heads
 | 
					    assert morphs == [t.morph_ for t in reloaded_example.reference]
 | 
				
			||||||
    assert biluo_tags == goldparse.ner
 | 
					    assert lemmas == [t.lemma_ for t in reloaded_example.reference]
 | 
				
			||||||
    assert "TRAVEL" in goldparse.cats
 | 
					    assert deps == [t.dep_ for t in reloaded_example.reference]
 | 
				
			||||||
    assert "BAKING" in goldparse.cats
 | 
					    assert heads == [t.head.i for t in reloaded_example.reference]
 | 
				
			||||||
    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
 | 
					    assert ents == [
 | 
				
			||||||
    assert cats["BAKING"] == goldparse.cats["BAKING"]
 | 
					        (e.start_char, e.end_char, e.label_) for e in reloaded_example.reference.ents
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
    # roundtrip to JSONL train dicts
 | 
					    assert "TRAVEL" in reloaded_example.reference.cats
 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					    assert "BAKING" in reloaded_example.reference.cats
 | 
				
			||||||
        jsonl_file = tmpdir / "roundtrip.jsonl"
 | 
					    assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
 | 
				
			||||||
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
 | 
					    assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
 | 
				
			||||||
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    reloaded_example = next(goldcorpus.dev_dataset(nlp))
 | 
					 | 
				
			||||||
    goldparse = reloaded_example.gold
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert len(doc) == goldcorpus.count_train()
 | 
					 | 
				
			||||||
    assert text == reloaded_example.text
 | 
					 | 
				
			||||||
    assert tags == goldparse.tags
 | 
					 | 
				
			||||||
    assert pos == goldparse.pos
 | 
					 | 
				
			||||||
    assert morphs == goldparse.morphs
 | 
					 | 
				
			||||||
    assert lemmas == goldparse.lemmas
 | 
					 | 
				
			||||||
    assert deps == goldparse.labels
 | 
					 | 
				
			||||||
    assert heads == goldparse.heads
 | 
					 | 
				
			||||||
    assert biluo_tags == goldparse.ner
 | 
					 | 
				
			||||||
    assert "TRAVEL" in goldparse.cats
 | 
					 | 
				
			||||||
    assert "BAKING" in goldparse.cats
 | 
					 | 
				
			||||||
    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
 | 
					 | 
				
			||||||
    assert cats["BAKING"] == goldparse.cats["BAKING"]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # roundtrip to JSONL tuples
 | 
					 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					 | 
				
			||||||
        jsonl_file = tmpdir / "roundtrip.jsonl"
 | 
					 | 
				
			||||||
        # write to JSONL train dicts
 | 
					 | 
				
			||||||
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
 | 
					 | 
				
			||||||
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
					 | 
				
			||||||
        # load and rewrite as JSONL tuples
 | 
					 | 
				
			||||||
        srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
 | 
					 | 
				
			||||||
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    reloaded_example = next(goldcorpus.dev_dataset(nlp))
 | 
					 | 
				
			||||||
    goldparse = reloaded_example.gold
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert len(doc) == goldcorpus.count_train()
 | 
					 | 
				
			||||||
    assert text == reloaded_example.text
 | 
					 | 
				
			||||||
    assert tags == goldparse.tags
 | 
					 | 
				
			||||||
    assert deps == goldparse.labels
 | 
					 | 
				
			||||||
    assert heads == goldparse.heads
 | 
					 | 
				
			||||||
    assert lemmas == goldparse.lemmas
 | 
					 | 
				
			||||||
    assert biluo_tags == goldparse.ner
 | 
					 | 
				
			||||||
    assert "TRAVEL" in goldparse.cats
 | 
					 | 
				
			||||||
    assert "BAKING" in goldparse.cats
 | 
					 | 
				
			||||||
    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
 | 
					 | 
				
			||||||
    assert cats["BAKING"] == goldparse.cats["BAKING"]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_projective_train_vs_nonprojective_dev(doc):
 | 
					 | 
				
			||||||
    nlp = English()
 | 
					 | 
				
			||||||
    deps = [t.dep_ for t in doc]
 | 
					 | 
				
			||||||
    heads = [t.head.i for t in doc]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					 | 
				
			||||||
        jsonl_file = tmpdir / "test.jsonl"
 | 
					 | 
				
			||||||
        # write to JSONL train dicts
 | 
					 | 
				
			||||||
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
 | 
					 | 
				
			||||||
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    train_reloaded_example = next(goldcorpus.train_dataset(nlp))
 | 
					 | 
				
			||||||
    train_goldparse = train_reloaded_example.gold
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    dev_reloaded_example = next(goldcorpus.dev_dataset(nlp))
 | 
					 | 
				
			||||||
    dev_goldparse = dev_reloaded_example.gold
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert is_nonproj_tree([t.head.i for t in doc]) is True
 | 
					 | 
				
			||||||
    assert is_nonproj_tree(train_goldparse.heads) is False
 | 
					 | 
				
			||||||
    assert heads[:-1] == train_goldparse.heads[:-1]
 | 
					 | 
				
			||||||
    assert heads[-1] != train_goldparse.heads[-1]
 | 
					 | 
				
			||||||
    assert deps[:-1] == train_goldparse.labels[:-1]
 | 
					 | 
				
			||||||
    assert deps[-1] != train_goldparse.labels[-1]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert heads == dev_goldparse.heads
 | 
					 | 
				
			||||||
    assert deps == dev_goldparse.labels
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Hm, not sure where misalignment check would be handled? In the components too?
 | 
				
			||||||
 | 
					# I guess that does make sense. A text categorizer doesn't care if it's
 | 
				
			||||||
 | 
					# misaligned...
 | 
				
			||||||
 | 
					@pytest.mark.xfail(reason="Outdated")
 | 
				
			||||||
def test_ignore_misaligned(doc):
 | 
					def test_ignore_misaligned(doc):
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    text = doc.text
 | 
					    text = doc.text
 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
        jsonl_file = tmpdir / "test.jsonl"
 | 
					        json_file = tmpdir / "test.json"
 | 
				
			||||||
        data = [docs_to_json(doc)]
 | 
					        data = [docs_to_json(doc)]
 | 
				
			||||||
        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
 | 
					        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
 | 
				
			||||||
        # write to JSONL train dicts
 | 
					        # write to JSON train dicts
 | 
				
			||||||
        srsly.write_jsonl(jsonl_file, data)
 | 
					        srsly.write_json(json_file, data)
 | 
				
			||||||
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
					        goldcorpus = Corpus(str(json_file), str(json_file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with pytest.raises(AlignmentError):
 | 
					        with pytest.raises(AlignmentError):
 | 
				
			||||||
        train_reloaded_example = next(goldcorpus.train_dataset(nlp))
 | 
					            train_reloaded_example = next(goldcorpus.train_dataset(nlp))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
        jsonl_file = tmpdir / "test.jsonl"
 | 
					        json_file = tmpdir / "test.json"
 | 
				
			||||||
        data = [docs_to_json(doc)]
 | 
					        data = [docs_to_json(doc)]
 | 
				
			||||||
        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
 | 
					        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
 | 
				
			||||||
        # write to JSONL train dicts
 | 
					        # write to JSON train dicts
 | 
				
			||||||
        srsly.write_jsonl(jsonl_file, data)
 | 
					        srsly.write_json(json_file, data)
 | 
				
			||||||
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
					        goldcorpus = Corpus(str(json_file), str(json_file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # doesn't raise an AlignmentError, but there is nothing to iterate over
 | 
					        # doesn't raise an AlignmentError, but there is nothing to iterate over
 | 
				
			||||||
    # because the only example can't be aligned
 | 
					        # because the only example can't be aligned
 | 
				
			||||||
    train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True))
 | 
					        train_reloaded_example = list(
 | 
				
			||||||
    assert len(train_reloaded_example) == 0
 | 
					            goldcorpus.train_dataset(nlp, ignore_misaligned=True)
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        assert len(train_reloaded_example) == 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# We probably want the orth variant logic back, but this test won't be quite
 | 
				
			||||||
 | 
					# right -- we need to go from DocBin.
 | 
				
			||||||
def test_make_orth_variants(doc):
 | 
					def test_make_orth_variants(doc):
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
        jsonl_file = tmpdir / "test.jsonl"
 | 
					        output_file = tmpdir / "roundtrip.spacy"
 | 
				
			||||||
        # write to JSONL train dicts
 | 
					        data = DocBin(docs=[doc]).to_bytes()
 | 
				
			||||||
        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
 | 
					        with output_file.open("wb") as file_:
 | 
				
			||||||
        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
					            file_.write(data)
 | 
				
			||||||
 | 
					        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # due to randomness, test only that this runs with no errors for now
 | 
					        # due to randomness, test only that this runs with no errors for now
 | 
				
			||||||
    train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
 | 
					        train_example = next(goldcorpus.train_dataset(nlp))
 | 
				
			||||||
    train_goldparse = train_reloaded_example.gold  # noqa: F841
 | 
					        variant_example = make_orth_variants_example(
 | 
				
			||||||
 | 
					            nlp, train_example, orth_variant_level=0.2
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
| 
						 | 
					@ -439,39 +535,35 @@ def test_align(tokens_a, tokens_b, expected):
 | 
				
			||||||
def test_goldparse_startswith_space(en_tokenizer):
 | 
					def test_goldparse_startswith_space(en_tokenizer):
 | 
				
			||||||
    text = " a"
 | 
					    text = " a"
 | 
				
			||||||
    doc = en_tokenizer(text)
 | 
					    doc = en_tokenizer(text)
 | 
				
			||||||
    g = GoldParse(doc, words=["a"], entities=["U-DATE"], deps=["ROOT"], heads=[0])
 | 
					    gold_words = ["a"]
 | 
				
			||||||
    assert g.words == [" ", "a"]
 | 
					    entities = ["U-DATE"]
 | 
				
			||||||
    assert g.ner == [None, "U-DATE"]
 | 
					    deps = ["ROOT"]
 | 
				
			||||||
    assert g.labels == [None, "ROOT"]
 | 
					    heads = [0]
 | 
				
			||||||
 | 
					    example = Example.from_dict(
 | 
				
			||||||
 | 
					        doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    ner_tags = example.get_aligned_ner()
 | 
				
			||||||
 | 
					    assert ner_tags == [None, "U-DATE"]
 | 
				
			||||||
 | 
					    assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_gold_constructor():
 | 
					def test_gold_constructor():
 | 
				
			||||||
    """Test that the GoldParse constructor works fine"""
 | 
					    """Test that the Example constructor works fine"""
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    doc = nlp("This is a sentence")
 | 
					    doc = nlp("This is a sentence")
 | 
				
			||||||
    gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
 | 
					    example = Example.from_dict(doc, {"cats": {"cat1": 1.0, "cat2": 0.0}})
 | 
				
			||||||
 | 
					    assert example.get_aligned("ORTH", as_string=True) == [
 | 
				
			||||||
    assert gold.cats["cat1"]
 | 
					        "This",
 | 
				
			||||||
    assert not gold.cats["cat2"]
 | 
					        "is",
 | 
				
			||||||
    assert gold.words == ["This", "is", "a", "sentence"]
 | 
					        "a",
 | 
				
			||||||
 | 
					        "sentence",
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
def test_gold_orig_annot():
 | 
					    assert example.reference.cats["cat1"]
 | 
				
			||||||
    nlp = English()
 | 
					    assert not example.reference.cats["cat2"]
 | 
				
			||||||
    doc = nlp("This is a sentence")
 | 
					 | 
				
			||||||
    gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert gold.orig.words == ["This", "is", "a", "sentence"]
 | 
					 | 
				
			||||||
    assert gold.cats["cat1"]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0})
 | 
					 | 
				
			||||||
    gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig)
 | 
					 | 
				
			||||||
    assert gold2.orig.words == ["This", "is", "a", "sentence"]
 | 
					 | 
				
			||||||
    assert not gold2.cats["cat1"]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_tuple_format_implicit():
 | 
					def test_tuple_format_implicit():
 | 
				
			||||||
    """Test tuple format with implicit GoldParse creation"""
 | 
					    """Test tuple format"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    train_data = [
 | 
					    train_data = [
 | 
				
			||||||
        ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
 | 
					        ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
 | 
				
			||||||
| 
						 | 
					@ -486,7 +578,7 @@ def test_tuple_format_implicit():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_tuple_format_implicit_invalid():
 | 
					def test_tuple_format_implicit_invalid():
 | 
				
			||||||
    """Test that an error is thrown for an implicit invalid GoldParse field"""
 | 
					    """Test that an error is thrown for an implicit invalid field"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    train_data = [
 | 
					    train_data = [
 | 
				
			||||||
        ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
 | 
					        ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
 | 
				
			||||||
| 
						 | 
					@ -497,10 +589,11 @@ def test_tuple_format_implicit_invalid():
 | 
				
			||||||
        ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
 | 
					        ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
 | 
				
			||||||
    ]
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with pytest.raises(TypeError):
 | 
					    with pytest.raises(KeyError):
 | 
				
			||||||
        _train(train_data)
 | 
					        _train(train_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _train(train_data):
 | 
					def _train(train_data):
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    ner = nlp.create_pipe("ner")
 | 
					    ner = nlp.create_pipe("ner")
 | 
				
			||||||
| 
						 | 
					@ -518,43 +611,23 @@ def _train(train_data):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_split_sents(merged_dict):
 | 
					def test_split_sents(merged_dict):
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    example = Example()
 | 
					    example = Example.from_dict(
 | 
				
			||||||
    example.set_token_annotation(**merged_dict)
 | 
					        Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
 | 
				
			||||||
    assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
 | 
					        merged_dict,
 | 
				
			||||||
    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1
 | 
					    )
 | 
				
			||||||
 | 
					    assert example.text == "Hi there everyone It is just me"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    split_examples = example.split_sents()
 | 
					    split_examples = example.split_sents()
 | 
				
			||||||
    assert len(split_examples) == 2
 | 
					    assert len(split_examples) == 2
 | 
				
			||||||
 | 
					    assert split_examples[0].text == "Hi there everyone "
 | 
				
			||||||
 | 
					    assert split_examples[1].text == "It is just me"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    token_annotation_1 = split_examples[0].token_annotation
 | 
					    token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
 | 
				
			||||||
    assert token_annotation_1.ids == [1, 2, 3]
 | 
					    assert token_annotation_1["words"] == ["Hi", "there", "everyone"]
 | 
				
			||||||
    assert token_annotation_1.words == ["Hi", "there", "everyone"]
 | 
					    assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"]
 | 
				
			||||||
    assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
 | 
					    assert token_annotation_1["sent_starts"] == [1, 0, 0]
 | 
				
			||||||
    assert token_annotation_1.sent_starts == [1, 0, 0]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    token_annotation_2 = split_examples[1].token_annotation
 | 
					    token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
 | 
				
			||||||
    assert token_annotation_2.ids == [4, 5, 6, 7]
 | 
					    assert token_annotation_2["words"] == ["It", "is", "just", "me"]
 | 
				
			||||||
    assert token_annotation_2.words == ["It", "is", "just", "me"]
 | 
					    assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
 | 
				
			||||||
    assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
 | 
					    assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
 | 
				
			||||||
    assert token_annotation_2.sent_starts == [1, 0, 0, 0]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_tuples_to_example(merged_dict):
 | 
					 | 
				
			||||||
    ex = Example()
 | 
					 | 
				
			||||||
    ex.set_token_annotation(**merged_dict)
 | 
					 | 
				
			||||||
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
 | 
					 | 
				
			||||||
    ex.set_doc_annotation(cats=cats)
 | 
					 | 
				
			||||||
    ex_dict = ex.to_dict()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
 | 
					 | 
				
			||||||
    assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
 | 
					 | 
				
			||||||
    assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
 | 
					 | 
				
			||||||
    assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
 | 
					 | 
				
			||||||
    assert ex_dict["doc_annotation"]["cats"] == cats
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def test_empty_example_goldparse():
 | 
					 | 
				
			||||||
    nlp = English()
 | 
					 | 
				
			||||||
    doc = nlp("")
 | 
					 | 
				
			||||||
    example = Example(doc=doc)
 | 
					 | 
				
			||||||
    assert len(example.get_gold_parses()) == 1
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,5 @@
 | 
				
			||||||
import itertools
 | 
					import itertools
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					 | 
				
			||||||
from spacy.language import Language
 | 
					from spacy.language import Language
 | 
				
			||||||
from spacy.tokens import Doc, Span
 | 
					from spacy.tokens import Doc, Span
 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
| 
						 | 
					@ -24,40 +23,27 @@ def test_language_update(nlp):
 | 
				
			||||||
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
 | 
					    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
 | 
				
			||||||
    wrongkeyannots = {"LABEL": True}
 | 
					    wrongkeyannots = {"LABEL": True}
 | 
				
			||||||
    doc = Doc(nlp.vocab, words=text.split(" "))
 | 
					    doc = Doc(nlp.vocab, words=text.split(" "))
 | 
				
			||||||
    gold = GoldParse(doc, **annots)
 | 
					 | 
				
			||||||
    # Update with doc and gold objects
 | 
					 | 
				
			||||||
    nlp.update((doc, gold))
 | 
					 | 
				
			||||||
    # Update with text and dict
 | 
					    # Update with text and dict
 | 
				
			||||||
    nlp.update((text, annots))
 | 
					    nlp.update((text, annots))
 | 
				
			||||||
    # Update with doc object and dict
 | 
					    # Update with doc object and dict
 | 
				
			||||||
    nlp.update((doc, annots))
 | 
					    nlp.update((doc, annots))
 | 
				
			||||||
    # Update with text and gold object
 | 
					 | 
				
			||||||
    nlp.update((text, gold))
 | 
					 | 
				
			||||||
    # Update with empty doc and gold object
 | 
					 | 
				
			||||||
    nlp.update((None, gold))
 | 
					 | 
				
			||||||
    # Update badly
 | 
					    # Update badly
 | 
				
			||||||
    with pytest.raises(ValueError):
 | 
					    with pytest.raises(ValueError):
 | 
				
			||||||
        nlp.update((doc, None))
 | 
					        nlp.update((doc, None))
 | 
				
			||||||
    with pytest.raises(TypeError):
 | 
					    with pytest.raises(KeyError):
 | 
				
			||||||
        nlp.update((text, wrongkeyannots))
 | 
					        nlp.update((text, wrongkeyannots))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_language_evaluate(nlp):
 | 
					def test_language_evaluate(nlp):
 | 
				
			||||||
    text = "hello world"
 | 
					    text = "hello world"
 | 
				
			||||||
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
 | 
					    annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
 | 
				
			||||||
    doc = Doc(nlp.vocab, words=text.split(" "))
 | 
					    doc = Doc(nlp.vocab, words=text.split(" "))
 | 
				
			||||||
    gold = GoldParse(doc, **annots)
 | 
					 | 
				
			||||||
    # Evaluate with doc and gold objects
 | 
					 | 
				
			||||||
    nlp.evaluate([(doc, gold)])
 | 
					 | 
				
			||||||
    # Evaluate with text and dict
 | 
					    # Evaluate with text and dict
 | 
				
			||||||
    nlp.evaluate([(text, annots)])
 | 
					    nlp.evaluate([(text, annots)])
 | 
				
			||||||
    # Evaluate with doc object and dict
 | 
					    # Evaluate with doc object and dict
 | 
				
			||||||
    nlp.evaluate([(doc, annots)])
 | 
					    nlp.evaluate([(doc, annots)])
 | 
				
			||||||
    # Evaluate with text and gold object
 | 
					 | 
				
			||||||
    nlp.evaluate([(text, gold)])
 | 
					 | 
				
			||||||
    # Evaluate badly
 | 
					 | 
				
			||||||
    with pytest.raises(Exception):
 | 
					    with pytest.raises(Exception):
 | 
				
			||||||
        nlp.evaluate([text, gold])
 | 
					        nlp.evaluate([text, annots])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_evaluate_no_pipe(nlp):
 | 
					def test_evaluate_no_pipe(nlp):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										242
									
								
								spacy/tests/test_new_example.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										242
									
								
								spacy/tests/test_new_example.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,242 @@
 | 
				
			||||||
 | 
					import pytest
 | 
				
			||||||
 | 
					from spacy.gold.example import Example
 | 
				
			||||||
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_Example_init_requires_doc_objects():
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    with pytest.raises(TypeError):
 | 
				
			||||||
 | 
					        example = Example(None, None)
 | 
				
			||||||
 | 
					    with pytest.raises(TypeError):
 | 
				
			||||||
 | 
					        example = Example(Doc(vocab, words=["hi"]), None)
 | 
				
			||||||
 | 
					    with pytest.raises(TypeError):
 | 
				
			||||||
 | 
					        example = Example(None, Doc(vocab, words=["hi"]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_Example_from_dict_basic():
 | 
				
			||||||
 | 
					    example = Example.from_dict(
 | 
				
			||||||
 | 
					        Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]}
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert isinstance(example.x, Doc)
 | 
				
			||||||
 | 
					    assert isinstance(example.y, Doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}]
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_invalid(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    with pytest.raises(KeyError):
 | 
				
			||||||
 | 
					        Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "pred_words", [["ice", "cream"], ["icecream"], ["i", "ce", "cream"]]
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("annots", [{"words": ["icecream"], "tags": ["NN"]}])
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_tags(pred_words, annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=pred_words)
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    for i, token in enumerate(example.reference):
 | 
				
			||||||
 | 
					        assert token.tag_ == annots["tags"][i]
 | 
				
			||||||
 | 
					    aligned_tags = example.get_aligned("tag", as_string=True)
 | 
				
			||||||
 | 
					    assert aligned_tags == ["NN" for _ in predicted]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_aligned_tags():
 | 
				
			||||||
 | 
					    pred_words = ["Apply", "some", "sunscreen", "unless", "you", "can", "not"]
 | 
				
			||||||
 | 
					    gold_words = ["Apply", "some", "sun", "screen", "unless", "you", "cannot"]
 | 
				
			||||||
 | 
					    gold_tags = ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"]
 | 
				
			||||||
 | 
					    annots = {"words": gold_words, "tags": gold_tags}
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=pred_words)
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    aligned_tags = example.get_aligned("tag", as_string=True)
 | 
				
			||||||
 | 
					    assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_aligned_tags_multi():
 | 
				
			||||||
 | 
					    pred_words = ["Applysome", "sunscreen", "unless", "you", "can", "not"]
 | 
				
			||||||
 | 
					    gold_words = ["Apply", "somesun", "screen", "unless", "you", "cannot"]
 | 
				
			||||||
 | 
					    gold_tags = ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"]
 | 
				
			||||||
 | 
					    annots = {"words": gold_words, "tags": gold_tags}
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=pred_words)
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    aligned_tags = example.get_aligned("tag", as_string=True)
 | 
				
			||||||
 | 
					    assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["I", "like", "London", "and", "Berlin", "."],
 | 
				
			||||||
 | 
					            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
 | 
				
			||||||
 | 
					            "heads": [1, 1, 1, 2, 2, 1],
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_parse(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    for i, token in enumerate(example.reference):
 | 
				
			||||||
 | 
					        assert token.dep_ == annots["deps"][i]
 | 
				
			||||||
 | 
					        assert token.head.i == annots["heads"][i]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["Sarah", "'s", "sister", "flew"],
 | 
				
			||||||
 | 
					            "morphs": [
 | 
				
			||||||
 | 
					                "NounType=prop|Number=sing",
 | 
				
			||||||
 | 
					                "Poss=yes",
 | 
				
			||||||
 | 
					                "Number=sing",
 | 
				
			||||||
 | 
					                "Tense=past|VerbForm=fin",
 | 
				
			||||||
 | 
					            ],
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_morphology(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    for i, token in enumerate(example.reference):
 | 
				
			||||||
 | 
					        assert token.morph_ == annots["morphs"][i]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["This", "is", "one", "sentence", "this", "is", "another"],
 | 
				
			||||||
 | 
					            "sent_starts": [1, 0, 0, 0, 1, 0, 0],
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_sent_start(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    assert len(list(example.reference.sents)) == 2
 | 
				
			||||||
 | 
					    for i, token in enumerate(example.reference):
 | 
				
			||||||
 | 
					        assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["This", "is", "a", "sentence"],
 | 
				
			||||||
 | 
					            "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_cats(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    assert len(list(example.reference.cats)) == 3
 | 
				
			||||||
 | 
					    assert example.reference.cats["cat1"] == 1.0
 | 
				
			||||||
 | 
					    assert example.reference.cats["cat2"] == 0.0
 | 
				
			||||||
 | 
					    assert example.reference.cats["cat3"] == 0.5
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
 | 
				
			||||||
 | 
					            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_entities(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert len(list(example.reference.ents)) == 2
 | 
				
			||||||
 | 
					    assert [example.reference[i].ent_iob_ for i in range(7)] == [
 | 
				
			||||||
 | 
					        "O",
 | 
				
			||||||
 | 
					        "O",
 | 
				
			||||||
 | 
					        "B",
 | 
				
			||||||
 | 
					        "I",
 | 
				
			||||||
 | 
					        "O",
 | 
				
			||||||
 | 
					        "B",
 | 
				
			||||||
 | 
					        "O",
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert example.reference[2].ent_type_ == "LOC"
 | 
				
			||||||
 | 
					    assert example.reference[3].ent_type_ == "LOC"
 | 
				
			||||||
 | 
					    assert example.reference[5].ent_type_ == "LOC"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
 | 
				
			||||||
 | 
					            "entities": [
 | 
				
			||||||
 | 
					                (0, 4, "LOC"),
 | 
				
			||||||
 | 
					                (21, 27, "LOC"),
 | 
				
			||||||
 | 
					            ],  # not aligned to token boundaries
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_entities_invalid(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    # TODO: shouldn't this throw some sort of warning ?
 | 
				
			||||||
 | 
					    assert len(list(example.reference.ents)) == 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
 | 
				
			||||||
 | 
					            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
 | 
				
			||||||
 | 
					            "links": {
 | 
				
			||||||
 | 
					                (7, 15): {"Q60": 1.0, "Q64": 0.0},
 | 
				
			||||||
 | 
					                (20, 26): {"Q60": 0.0, "Q64": 1.0},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_links(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    example = Example.from_dict(predicted, annots)
 | 
				
			||||||
 | 
					    assert example.reference[0].ent_kb_id_ == ""
 | 
				
			||||||
 | 
					    assert example.reference[1].ent_kb_id_ == ""
 | 
				
			||||||
 | 
					    assert example.reference[2].ent_kb_id_ == "Q60"
 | 
				
			||||||
 | 
					    assert example.reference[3].ent_kb_id_ == "Q60"
 | 
				
			||||||
 | 
					    assert example.reference[4].ent_kb_id_ == ""
 | 
				
			||||||
 | 
					    assert example.reference[5].ent_kb_id_ == "Q64"
 | 
				
			||||||
 | 
					    assert example.reference[6].ent_kb_id_ == ""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "annots",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
 | 
				
			||||||
 | 
					            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
 | 
				
			||||||
 | 
					            "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_Example_from_dict_with_links_invalid(annots):
 | 
				
			||||||
 | 
					    vocab = Vocab()
 | 
				
			||||||
 | 
					    predicted = Doc(vocab, words=annots["words"])
 | 
				
			||||||
 | 
					    with pytest.raises(ValueError):
 | 
				
			||||||
 | 
					        Example.from_dict(predicted, annots)
 | 
				
			||||||
| 
						 | 
					@ -1,12 +1,14 @@
 | 
				
			||||||
from numpy.testing import assert_almost_equal, assert_array_almost_equal
 | 
					from numpy.testing import assert_almost_equal, assert_array_almost_equal
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from pytest import approx
 | 
					from pytest import approx
 | 
				
			||||||
from spacy.gold import Example, GoldParse
 | 
					from spacy.gold import Example
 | 
				
			||||||
 | 
					from spacy.gold.iob_utils import biluo_tags_from_offsets
 | 
				
			||||||
from spacy.scorer import Scorer, ROCAUCScore
 | 
					from spacy.scorer import Scorer, ROCAUCScore
 | 
				
			||||||
from spacy.scorer import _roc_auc_score, _roc_curve
 | 
					from spacy.scorer import _roc_auc_score, _roc_curve
 | 
				
			||||||
from .util import get_doc
 | 
					from .util import get_doc
 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
test_las_apple = [
 | 
					test_las_apple = [
 | 
				
			||||||
    [
 | 
					    [
 | 
				
			||||||
        "Apple is looking at buying U.K. startup for $ 1 billion",
 | 
					        "Apple is looking at buying U.K. startup for $ 1 billion",
 | 
				
			||||||
| 
						 | 
					@ -89,8 +91,9 @@ def test_las_per_type(en_vocab):
 | 
				
			||||||
            heads=([h - i for i, h in enumerate(annot["heads"])]),
 | 
					            heads=([h - i for i, h in enumerate(annot["heads"])]),
 | 
				
			||||||
            deps=annot["deps"],
 | 
					            deps=annot["deps"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
 | 
					        gold = {"heads": annot["heads"], "deps": annot["deps"]}
 | 
				
			||||||
        scorer.score((doc, gold))
 | 
					        example = Example.from_dict(doc, gold)
 | 
				
			||||||
 | 
					        scorer.score(example)
 | 
				
			||||||
    results = scorer.scores
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert results["uas"] == 100
 | 
					    assert results["uas"] == 100
 | 
				
			||||||
| 
						 | 
					@ -111,9 +114,10 @@ def test_las_per_type(en_vocab):
 | 
				
			||||||
            heads=([h - i for i, h in enumerate(annot["heads"])]),
 | 
					            heads=([h - i for i, h in enumerate(annot["heads"])]),
 | 
				
			||||||
            deps=annot["deps"],
 | 
					            deps=annot["deps"],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
 | 
					        gold = {"heads": annot["heads"], "deps": annot["deps"]}
 | 
				
			||||||
        doc[0].dep_ = "compound"
 | 
					        doc[0].dep_ = "compound"
 | 
				
			||||||
        scorer.score((doc, gold))
 | 
					        example = Example.from_dict(doc, gold)
 | 
				
			||||||
 | 
					        scorer.score(example)
 | 
				
			||||||
    results = scorer.scores
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert results["uas"] == 100
 | 
					    assert results["uas"] == 100
 | 
				
			||||||
| 
						 | 
					@ -135,8 +139,8 @@ def test_ner_per_type(en_vocab):
 | 
				
			||||||
            words=input_.split(" "),
 | 
					            words=input_.split(" "),
 | 
				
			||||||
            ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
 | 
					            ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        ex = Example(doc=doc)
 | 
					        entities = biluo_tags_from_offsets(doc, annot["entities"])
 | 
				
			||||||
        ex.set_token_annotation(entities=annot["entities"])
 | 
					        ex = Example.from_dict(doc, {"entities": entities})
 | 
				
			||||||
        scorer.score(ex)
 | 
					        scorer.score(ex)
 | 
				
			||||||
    results = scorer.scores
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -156,8 +160,8 @@ def test_ner_per_type(en_vocab):
 | 
				
			||||||
            words=input_.split(" "),
 | 
					            words=input_.split(" "),
 | 
				
			||||||
            ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
 | 
					            ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        ex = Example(doc=doc)
 | 
					        entities = biluo_tags_from_offsets(doc, annot["entities"])
 | 
				
			||||||
        ex.set_token_annotation(entities=annot["entities"])
 | 
					        ex = Example.from_dict(doc, {"entities": entities})
 | 
				
			||||||
        scorer.score(ex)
 | 
					        scorer.score(ex)
 | 
				
			||||||
    results = scorer.scores
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -181,13 +185,13 @@ def test_ner_per_type(en_vocab):
 | 
				
			||||||
def test_tag_score(tagged_doc):
 | 
					def test_tag_score(tagged_doc):
 | 
				
			||||||
    # Gold and Doc are identical
 | 
					    # Gold and Doc are identical
 | 
				
			||||||
    scorer = Scorer()
 | 
					    scorer = Scorer()
 | 
				
			||||||
    gold = GoldParse(
 | 
					    gold = {
 | 
				
			||||||
        tagged_doc,
 | 
					        "tags": [t.tag_ for t in tagged_doc],
 | 
				
			||||||
        tags=[t.tag_ for t in tagged_doc],
 | 
					        "pos": [t.pos_ for t in tagged_doc],
 | 
				
			||||||
        pos=[t.pos_ for t in tagged_doc],
 | 
					        "morphs": [t.morph_ for t in tagged_doc],
 | 
				
			||||||
        morphs=[t.morph_ for t in tagged_doc],
 | 
					    }
 | 
				
			||||||
    )
 | 
					    example = Example.from_dict(tagged_doc, gold)
 | 
				
			||||||
    scorer.score((tagged_doc, gold))
 | 
					    scorer.score(example)
 | 
				
			||||||
    results = scorer.scores
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert results["tags_acc"] == 100
 | 
					    assert results["tags_acc"] == 100
 | 
				
			||||||
| 
						 | 
					@ -204,8 +208,9 @@ def test_tag_score(tagged_doc):
 | 
				
			||||||
    morphs = [t.morph_ for t in tagged_doc]
 | 
					    morphs = [t.morph_ for t in tagged_doc]
 | 
				
			||||||
    morphs[1] = "Number=sing"
 | 
					    morphs[1] = "Number=sing"
 | 
				
			||||||
    morphs[2] = "Number=plur"
 | 
					    morphs[2] = "Number=plur"
 | 
				
			||||||
    gold = GoldParse(tagged_doc, tags=tags, pos=pos, morphs=morphs)
 | 
					    gold = {"tags": tags, "pos": pos, "morphs": morphs}
 | 
				
			||||||
    scorer.score((tagged_doc, gold))
 | 
					    example = Example.from_dict(tagged_doc, gold)
 | 
				
			||||||
 | 
					    scorer.score(example)
 | 
				
			||||||
    results = scorer.scores
 | 
					    results = scorer.scores
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert results["tags_acc"] == 90
 | 
					    assert results["tags_acc"] == 90
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,4 @@
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
from spacy.gold import Example
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .util import get_random_doc
 | 
					from .util import get_random_doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -25,19 +24,16 @@ from spacy.util import minibatch_by_words
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
def test_util_minibatch(doc_sizes, expected_batches):
 | 
					def test_util_minibatch(doc_sizes, expected_batches):
 | 
				
			||||||
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
 | 
					    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
 | 
				
			||||||
    examples = [Example(doc=doc) for doc in docs]
 | 
					 | 
				
			||||||
    tol = 0.2
 | 
					    tol = 0.2
 | 
				
			||||||
    batch_size = 1000
 | 
					    batch_size = 1000
 | 
				
			||||||
    batches = list(
 | 
					    batches = list(
 | 
				
			||||||
        minibatch_by_words(
 | 
					        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
 | 
				
			||||||
            examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    assert [len(batch) for batch in batches] == expected_batches
 | 
					    assert [len(batch) for batch in batches] == expected_batches
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    max_size = batch_size + batch_size * tol
 | 
					    max_size = batch_size + batch_size * tol
 | 
				
			||||||
    for batch in batches:
 | 
					    for batch in batches:
 | 
				
			||||||
        assert sum([len(example.doc) for example in batch]) < max_size
 | 
					        assert sum([len(doc) for doc in batch]) < max_size
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
| 
						 | 
					@ -54,12 +50,9 @@ def test_util_minibatch(doc_sizes, expected_batches):
 | 
				
			||||||
def test_util_minibatch_oversize(doc_sizes, expected_batches):
 | 
					def test_util_minibatch_oversize(doc_sizes, expected_batches):
 | 
				
			||||||
    """ Test that oversized documents are returned in their own batch"""
 | 
					    """ Test that oversized documents are returned in their own batch"""
 | 
				
			||||||
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
 | 
					    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
 | 
				
			||||||
    examples = [Example(doc=doc) for doc in docs]
 | 
					 | 
				
			||||||
    tol = 0.2
 | 
					    tol = 0.2
 | 
				
			||||||
    batch_size = 1000
 | 
					    batch_size = 1000
 | 
				
			||||||
    batches = list(
 | 
					    batches = list(
 | 
				
			||||||
        minibatch_by_words(
 | 
					        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
 | 
				
			||||||
            examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    assert [len(batch) for batch in batches] == expected_batches
 | 
					    assert [len(batch) for batch in batches] == expected_batches
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,15 +1,14 @@
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
import tempfile
 | 
					import tempfile
 | 
				
			||||||
import shutil
 | 
					 | 
				
			||||||
import contextlib
 | 
					import contextlib
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
from pathlib import Path
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy import Errors
 | 
					from spacy import Errors
 | 
				
			||||||
from spacy.tokens import Doc, Span
 | 
					from spacy.tokens import Doc, Span
 | 
				
			||||||
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
 | 
					from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					from spacy.vocab import Vocab
 | 
				
			||||||
from spacy.util import make_tempdir
 | 
					from spacy.util import make_tempdir  # noqa: F401
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@contextlib.contextmanager
 | 
					@contextlib.contextmanager
 | 
				
			||||||
| 
						 | 
					@ -20,15 +19,23 @@ def make_tempfile(mode="r"):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_doc(
 | 
					def get_doc(
 | 
				
			||||||
    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
 | 
					    vocab,
 | 
				
			||||||
 | 
					    words=[],
 | 
				
			||||||
 | 
					    pos=None,
 | 
				
			||||||
 | 
					    heads=None,
 | 
				
			||||||
 | 
					    deps=None,
 | 
				
			||||||
 | 
					    tags=None,
 | 
				
			||||||
 | 
					    ents=None,
 | 
				
			||||||
 | 
					    lemmas=None,
 | 
				
			||||||
 | 
					    morphs=None,
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """Create Doc object from given vocab, words and annotations."""
 | 
					    """Create Doc object from given vocab, words and annotations."""
 | 
				
			||||||
    if deps and not heads:
 | 
					    if deps and not heads:
 | 
				
			||||||
        heads = [0] * len(deps)
 | 
					        heads = [0] * len(deps)
 | 
				
			||||||
    headings = []
 | 
					    headings = []
 | 
				
			||||||
    values = []
 | 
					    values = []
 | 
				
			||||||
    annotations = [pos, heads, deps, lemmas, tags]
 | 
					    annotations = [pos, heads, deps, lemmas, tags, morphs]
 | 
				
			||||||
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
 | 
					    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
 | 
				
			||||||
    for a, annot in enumerate(annotations):
 | 
					    for a, annot in enumerate(annotations):
 | 
				
			||||||
        if annot is not None:
 | 
					        if annot is not None:
 | 
				
			||||||
            if len(annot) != len(words):
 | 
					            if len(annot) != len(words):
 | 
				
			||||||
| 
						 | 
					@ -54,6 +61,13 @@ def get_doc(
 | 
				
			||||||
                            attrs[i] = heads[i]
 | 
					                            attrs[i] = heads[i]
 | 
				
			||||||
                        else:
 | 
					                        else:
 | 
				
			||||||
                            attrs[i, j] = heads[i]
 | 
					                            attrs[i, j] = heads[i]
 | 
				
			||||||
 | 
					                elif annot is morphs:
 | 
				
			||||||
 | 
					                    for i in range(len(words)):
 | 
				
			||||||
 | 
					                        morph_key = vocab.morphology.add(morphs[i])
 | 
				
			||||||
 | 
					                        if attrs.ndim == 1:
 | 
				
			||||||
 | 
					                            attrs[i] = morph_key
 | 
				
			||||||
 | 
					                        else:
 | 
				
			||||||
 | 
					                            attrs[i, j] = morph_key
 | 
				
			||||||
                else:
 | 
					                else:
 | 
				
			||||||
                    for i in range(len(words)):
 | 
					                    for i in range(len(words)):
 | 
				
			||||||
                        if attrs.ndim == 1:
 | 
					                        if attrs.ndim == 1:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -218,7 +218,7 @@ cdef class Tokenizer:
 | 
				
			||||||
            doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
 | 
					            doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False):
 | 
					    def pipe(self, texts, batch_size=1000, n_threads=-1):
 | 
				
			||||||
        """Tokenize a stream of texts.
 | 
					        """Tokenize a stream of texts.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        texts: A sequence of unicode texts.
 | 
					        texts: A sequence of unicode texts.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,6 +9,9 @@ from ..attrs import SPACY, ORTH, intify_attr
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class DocBin(object):
 | 
					class DocBin(object):
 | 
				
			||||||
    """Pack Doc objects for binary serialization.
 | 
					    """Pack Doc objects for binary serialization.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -39,7 +42,7 @@ class DocBin(object):
 | 
				
			||||||
    document from the DocBin.
 | 
					    document from the DocBin.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, attrs=None, store_user_data=False):
 | 
					    def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
 | 
				
			||||||
        """Create a DocBin object to hold serialized annotations.
 | 
					        """Create a DocBin object to hold serialized annotations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
 | 
					        attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
 | 
				
			||||||
| 
						 | 
					@ -49,7 +52,6 @@ class DocBin(object):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/docbin#init
 | 
					        DOCS: https://spacy.io/api/docbin#init
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        attrs = attrs or []
 | 
					 | 
				
			||||||
        attrs = sorted([intify_attr(attr) for attr in attrs])
 | 
					        attrs = sorted([intify_attr(attr) for attr in attrs])
 | 
				
			||||||
        self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
 | 
					        self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
 | 
				
			||||||
        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
 | 
					        self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
 | 
				
			||||||
| 
						 | 
					@ -59,6 +61,8 @@ class DocBin(object):
 | 
				
			||||||
        self.user_data = []
 | 
					        self.user_data = []
 | 
				
			||||||
        self.strings = set()
 | 
					        self.strings = set()
 | 
				
			||||||
        self.store_user_data = store_user_data
 | 
					        self.store_user_data = store_user_data
 | 
				
			||||||
 | 
					        for doc in docs:
 | 
				
			||||||
 | 
					            self.add(doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __len__(self):
 | 
					    def __len__(self):
 | 
				
			||||||
        """RETURNS: The number of Doc objects added to the DocBin."""
 | 
					        """RETURNS: The number of Doc objects added to the DocBin."""
 | 
				
			||||||
| 
						 | 
					@ -79,7 +83,12 @@ class DocBin(object):
 | 
				
			||||||
        assert array.shape[0] == spaces.shape[0]  # this should never happen
 | 
					        assert array.shape[0] == spaces.shape[0]  # this should never happen
 | 
				
			||||||
        spaces = spaces.reshape((spaces.shape[0], 1))
 | 
					        spaces = spaces.reshape((spaces.shape[0], 1))
 | 
				
			||||||
        self.spaces.append(numpy.asarray(spaces, dtype=bool))
 | 
					        self.spaces.append(numpy.asarray(spaces, dtype=bool))
 | 
				
			||||||
        self.strings.update(w.text for w in doc)
 | 
					        for token in doc:
 | 
				
			||||||
 | 
					            self.strings.add(token.text)
 | 
				
			||||||
 | 
					            self.strings.add(token.tag_)
 | 
				
			||||||
 | 
					            self.strings.add(token.lemma_)
 | 
				
			||||||
 | 
					            self.strings.add(token.dep_)
 | 
				
			||||||
 | 
					            self.strings.add(token.ent_type_)
 | 
				
			||||||
        self.cats.append(doc.cats)
 | 
					        self.cats.append(doc.cats)
 | 
				
			||||||
        if self.store_user_data:
 | 
					        if self.store_user_data:
 | 
				
			||||||
            self.user_data.append(srsly.msgpack_dumps(doc.user_data))
 | 
					            self.user_data.append(srsly.msgpack_dumps(doc.user_data))
 | 
				
			||||||
| 
						 | 
					@ -98,8 +107,7 @@ class DocBin(object):
 | 
				
			||||||
        for i in range(len(self.tokens)):
 | 
					        for i in range(len(self.tokens)):
 | 
				
			||||||
            tokens = self.tokens[i]
 | 
					            tokens = self.tokens[i]
 | 
				
			||||||
            spaces = self.spaces[i]
 | 
					            spaces = self.spaces[i]
 | 
				
			||||||
            words = [vocab.strings[orth] for orth in tokens[:, orth_col]]
 | 
					            doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
 | 
				
			||||||
            doc = Doc(vocab, words=words, spaces=spaces)
 | 
					 | 
				
			||||||
            doc = doc.from_array(self.attrs, tokens)
 | 
					            doc = doc.from_array(self.attrs, tokens)
 | 
				
			||||||
            doc.cats = self.cats[i]
 | 
					            doc.cats = self.cats[i]
 | 
				
			||||||
            if self.store_user_data:
 | 
					            if self.store_user_data:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,6 +3,7 @@ cimport cython
 | 
				
			||||||
cimport numpy as np
 | 
					cimport numpy as np
 | 
				
			||||||
from libc.string cimport memcpy, memset
 | 
					from libc.string cimport memcpy, memset
 | 
				
			||||||
from libc.math cimport sqrt
 | 
					from libc.math cimport sqrt
 | 
				
			||||||
 | 
					from libc.stdint cimport int32_t, uint64_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from collections import Counter
 | 
					from collections import Counter
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
| 
						 | 
					@ -12,13 +13,14 @@ import srsly
 | 
				
			||||||
from thinc.api import get_array_module
 | 
					from thinc.api import get_array_module
 | 
				
			||||||
from thinc.util import copy_array
 | 
					from thinc.util import copy_array
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
 | 
					import copy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .span cimport Span
 | 
					from .span cimport Span
 | 
				
			||||||
from .token cimport Token
 | 
					from .token cimport Token
 | 
				
			||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
 | 
					from ..lexeme cimport Lexeme, EMPTY_LEXEME
 | 
				
			||||||
from ..typedefs cimport attr_t, flags_t
 | 
					from ..typedefs cimport attr_t, flags_t
 | 
				
			||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 | 
					from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 | 
				
			||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
 | 
					from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
 | 
				
			||||||
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
 | 
					from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
 | 
				
			||||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 | 
					from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -52,6 +54,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 | 
				
			||||||
        return token.pos
 | 
					        return token.pos
 | 
				
			||||||
    elif feat_name == TAG:
 | 
					    elif feat_name == TAG:
 | 
				
			||||||
        return token.tag
 | 
					        return token.tag
 | 
				
			||||||
 | 
					    elif feat_name == MORPH:
 | 
				
			||||||
 | 
					        return token.morph
 | 
				
			||||||
    elif feat_name == DEP:
 | 
					    elif feat_name == DEP:
 | 
				
			||||||
        return token.dep
 | 
					        return token.dep
 | 
				
			||||||
    elif feat_name == HEAD:
 | 
					    elif feat_name == HEAD:
 | 
				
			||||||
| 
						 | 
					@ -184,7 +188,7 @@ cdef class Doc:
 | 
				
			||||||
        DOCS: https://spacy.io/api/doc#init
 | 
					        DOCS: https://spacy.io/api/doc#init
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        self.vocab = vocab
 | 
					        self.vocab = vocab
 | 
				
			||||||
        size = 20
 | 
					        size = max(20, (len(words) if words is not None else 0))
 | 
				
			||||||
        self.mem = Pool()
 | 
					        self.mem = Pool()
 | 
				
			||||||
        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
 | 
					        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
 | 
				
			||||||
        # However, we need to remember the true starting places, so that we can
 | 
					        # However, we need to remember the true starting places, so that we can
 | 
				
			||||||
| 
						 | 
					@ -209,7 +213,6 @@ cdef class Doc:
 | 
				
			||||||
        self.user_data = {} if user_data is None else user_data
 | 
					        self.user_data = {} if user_data is None else user_data
 | 
				
			||||||
        self._vector = None
 | 
					        self._vector = None
 | 
				
			||||||
        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
 | 
					        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
 | 
				
			||||||
        cdef unicode orth
 | 
					 | 
				
			||||||
        cdef bint has_space
 | 
					        cdef bint has_space
 | 
				
			||||||
        if orths_and_spaces is None and words is not None:
 | 
					        if orths_and_spaces is None and words is not None:
 | 
				
			||||||
            if spaces is None:
 | 
					            if spaces is None:
 | 
				
			||||||
| 
						 | 
					@ -217,19 +220,22 @@ cdef class Doc:
 | 
				
			||||||
            elif len(spaces) != len(words):
 | 
					            elif len(spaces) != len(words):
 | 
				
			||||||
                raise ValueError(Errors.E027)
 | 
					                raise ValueError(Errors.E027)
 | 
				
			||||||
            orths_and_spaces = zip(words, spaces)
 | 
					            orths_and_spaces = zip(words, spaces)
 | 
				
			||||||
 | 
					        cdef const LexemeC* lexeme
 | 
				
			||||||
        if orths_and_spaces is not None:
 | 
					        if orths_and_spaces is not None:
 | 
				
			||||||
 | 
					            orths_and_spaces = list(orths_and_spaces)
 | 
				
			||||||
            for orth_space in orths_and_spaces:
 | 
					            for orth_space in orths_and_spaces:
 | 
				
			||||||
                if isinstance(orth_space, unicode):
 | 
					                if isinstance(orth_space, unicode):
 | 
				
			||||||
                    orth = orth_space
 | 
					                    lexeme = self.vocab.get(self.mem, orth_space)
 | 
				
			||||||
                    has_space = True
 | 
					                    has_space = True
 | 
				
			||||||
                elif isinstance(orth_space, bytes):
 | 
					                elif isinstance(orth_space, bytes):
 | 
				
			||||||
                    raise ValueError(Errors.E028.format(value=orth_space))
 | 
					                    raise ValueError(Errors.E028.format(value=orth_space))
 | 
				
			||||||
 | 
					                elif isinstance(orth_space[0], unicode):
 | 
				
			||||||
 | 
					                    lexeme = self.vocab.get(self.mem, orth_space[0])
 | 
				
			||||||
 | 
					                    has_space = orth_space[1]
 | 
				
			||||||
                else:
 | 
					                else:
 | 
				
			||||||
                    orth, has_space = orth_space
 | 
					                    lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
 | 
				
			||||||
                # Note that we pass self.mem here --- we have ownership, if LexemeC
 | 
					                    has_space = orth_space[1]
 | 
				
			||||||
                # must be created.
 | 
					                self.push_back(lexeme, has_space)
 | 
				
			||||||
                self.push_back(
 | 
					 | 
				
			||||||
                    <const LexemeC*>self.vocab.get(self.mem, orth), has_space)
 | 
					 | 
				
			||||||
        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
 | 
					        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
 | 
				
			||||||
        # There's no information we'd like to add to it, so I guess so?
 | 
					        # There's no information we'd like to add to it, so I guess so?
 | 
				
			||||||
        if self.length == 0:
 | 
					        if self.length == 0:
 | 
				
			||||||
| 
						 | 
					@ -517,7 +523,8 @@ cdef class Doc:
 | 
				
			||||||
                    if start == -1:
 | 
					                    if start == -1:
 | 
				
			||||||
                        seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
 | 
					                        seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
 | 
				
			||||||
                        raise ValueError(Errors.E093.format(seq=" ".join(seq)))
 | 
					                        raise ValueError(Errors.E093.format(seq=" ".join(seq)))
 | 
				
			||||||
                elif token.ent_iob == 2 or token.ent_iob == 0:
 | 
					                elif token.ent_iob == 2 or token.ent_iob == 0 or \
 | 
				
			||||||
 | 
					                        (token.ent_iob == 3 and token.ent_type == 0):
 | 
				
			||||||
                    if start != -1:
 | 
					                    if start != -1:
 | 
				
			||||||
                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
 | 
					                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
 | 
				
			||||||
                    start = -1
 | 
					                    start = -1
 | 
				
			||||||
| 
						 | 
					@ -531,6 +538,8 @@ cdef class Doc:
 | 
				
			||||||
                    kb_id = token.ent_kb_id
 | 
					                    kb_id = token.ent_kb_id
 | 
				
			||||||
            if start != -1:
 | 
					            if start != -1:
 | 
				
			||||||
                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
 | 
					                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
 | 
				
			||||||
 | 
					            # remove empty-label spans
 | 
				
			||||||
 | 
					            output = [o for o in output if o.label_ != ""]
 | 
				
			||||||
            return tuple(output)
 | 
					            return tuple(output)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def __set__(self, ents):
 | 
					        def __set__(self, ents):
 | 
				
			||||||
| 
						 | 
					@ -699,8 +708,12 @@ cdef class Doc:
 | 
				
			||||||
            # Handle inputs like doc.to_array(ORTH)
 | 
					            # Handle inputs like doc.to_array(ORTH)
 | 
				
			||||||
            py_attr_ids = [py_attr_ids]
 | 
					            py_attr_ids = [py_attr_ids]
 | 
				
			||||||
        # Allow strings, e.g. 'lemma' or 'LEMMA'
 | 
					        # Allow strings, e.g. 'lemma' or 'LEMMA'
 | 
				
			||||||
        py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
 | 
					        try:
 | 
				
			||||||
 | 
					            py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
 | 
				
			||||||
                       for id_ in py_attr_ids]
 | 
					                       for id_ in py_attr_ids]
 | 
				
			||||||
 | 
					        except KeyError as msg:
 | 
				
			||||||
 | 
					            keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
 | 
				
			||||||
 | 
					            raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys))
 | 
				
			||||||
        # Make an array from the attributes --- otherwise our inner loop is
 | 
					        # Make an array from the attributes --- otherwise our inner loop is
 | 
				
			||||||
        # Python dict iteration.
 | 
					        # Python dict iteration.
 | 
				
			||||||
        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
 | 
					        cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i")
 | 
				
			||||||
| 
						 | 
					@ -747,6 +760,8 @@ cdef class Doc:
 | 
				
			||||||
            return dict(counts)
 | 
					            return dict(counts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _realloc(self, new_size):
 | 
					    def _realloc(self, new_size):
 | 
				
			||||||
 | 
					        if new_size < self.max_length:
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
        self.max_length = new_size
 | 
					        self.max_length = new_size
 | 
				
			||||||
        n = new_size + (PADDING * 2)
 | 
					        n = new_size + (PADDING * 2)
 | 
				
			||||||
        # What we're storing is a "padded" array. We've jumped forward PADDING
 | 
					        # What we're storing is a "padded" array. We've jumped forward PADDING
 | 
				
			||||||
| 
						 | 
					@ -795,10 +810,14 @@ cdef class Doc:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if SENT_START in attrs and HEAD in attrs:
 | 
					        if SENT_START in attrs and HEAD in attrs:
 | 
				
			||||||
            raise ValueError(Errors.E032)
 | 
					            raise ValueError(Errors.E032)
 | 
				
			||||||
        cdef int i, col, abs_head_index
 | 
					        cdef int i, col
 | 
				
			||||||
 | 
					        cdef int32_t abs_head_index
 | 
				
			||||||
        cdef attr_id_t attr_id
 | 
					        cdef attr_id_t attr_id
 | 
				
			||||||
        cdef TokenC* tokens = self.c
 | 
					        cdef TokenC* tokens = self.c
 | 
				
			||||||
        cdef int length = len(array)
 | 
					        cdef int length = len(array)
 | 
				
			||||||
 | 
					        if length != len(self):
 | 
				
			||||||
 | 
					            raise ValueError("Cannot set array values longer than the document.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Get set up for fast loading
 | 
					        # Get set up for fast loading
 | 
				
			||||||
        cdef Pool mem = Pool()
 | 
					        cdef Pool mem = Pool()
 | 
				
			||||||
        cdef int n_attrs = len(attrs)
 | 
					        cdef int n_attrs = len(attrs)
 | 
				
			||||||
| 
						 | 
					@ -809,26 +828,52 @@ cdef class Doc:
 | 
				
			||||||
            attr_ids[i] = attr_id
 | 
					            attr_ids[i] = attr_id
 | 
				
			||||||
        if len(array.shape) == 1:
 | 
					        if len(array.shape) == 1:
 | 
				
			||||||
            array = array.reshape((array.size, 1))
 | 
					            array = array.reshape((array.size, 1))
 | 
				
			||||||
 | 
					        cdef np.ndarray transposed_array = numpy.ascontiguousarray(array.T)
 | 
				
			||||||
 | 
					        values = <const uint64_t*>transposed_array.data
 | 
				
			||||||
 | 
					        stride = transposed_array.shape[1]
 | 
				
			||||||
        # Check that all heads are within the document bounds
 | 
					        # Check that all heads are within the document bounds
 | 
				
			||||||
        if HEAD in attrs:
 | 
					        if HEAD in attrs:
 | 
				
			||||||
            col = attrs.index(HEAD)
 | 
					            col = attrs.index(HEAD)
 | 
				
			||||||
            for i in range(length):
 | 
					            for i in range(length):
 | 
				
			||||||
                # cast index to signed int
 | 
					                # cast index to signed int
 | 
				
			||||||
                abs_head_index = numpy.int32(array[i, col]) + i
 | 
					                abs_head_index = <int32_t>values[col * stride + i]
 | 
				
			||||||
 | 
					                abs_head_index += i
 | 
				
			||||||
                if abs_head_index < 0 or abs_head_index >= length:
 | 
					                if abs_head_index < 0 or abs_head_index >= length:
 | 
				
			||||||
                    raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col])))
 | 
					                    raise ValueError(
 | 
				
			||||||
 | 
					                        Errors.E190.format(
 | 
				
			||||||
 | 
					                            index=i,
 | 
				
			||||||
 | 
					                            value=array[i, col],
 | 
				
			||||||
 | 
					                            rel_head_index=abs_head_index-i
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
        # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
 | 
					        # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA
 | 
				
			||||||
        if TAG in attrs:
 | 
					        if TAG in attrs:
 | 
				
			||||||
            col = attrs.index(TAG)
 | 
					            col = attrs.index(TAG)
 | 
				
			||||||
            for i in range(length):
 | 
					            for i in range(length):
 | 
				
			||||||
                if array[i, col] != 0:
 | 
					                value = values[col * stride + i]
 | 
				
			||||||
                    self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
 | 
					                if value != 0:
 | 
				
			||||||
 | 
					                    self.vocab.morphology.assign_tag(&tokens[i], value)
 | 
				
			||||||
 | 
					        # Verify ENT_IOB are proper integers
 | 
				
			||||||
 | 
					        if ENT_IOB in attrs:
 | 
				
			||||||
 | 
					            iob_strings = Token.iob_strings()
 | 
				
			||||||
 | 
					            col = attrs.index(ENT_IOB)
 | 
				
			||||||
 | 
					            n_iob_strings = len(iob_strings)
 | 
				
			||||||
 | 
					            for i in range(length):
 | 
				
			||||||
 | 
					                value = values[col * stride + i]
 | 
				
			||||||
 | 
					                if value < 0 or value >= n_iob_strings:
 | 
				
			||||||
 | 
					                    raise ValueError(
 | 
				
			||||||
 | 
					                        Errors.E982.format(
 | 
				
			||||||
 | 
					                            values=iob_strings,
 | 
				
			||||||
 | 
					                            value=value
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
        # Now load the data
 | 
					        # Now load the data
 | 
				
			||||||
        for i in range(length):
 | 
					        for i in range(length):
 | 
				
			||||||
            token = &self.c[i]
 | 
					            token = &self.c[i]
 | 
				
			||||||
            for j in range(n_attrs):
 | 
					            for j in range(n_attrs):
 | 
				
			||||||
                if attr_ids[j] != TAG:
 | 
					                if attr_ids[j] != TAG:
 | 
				
			||||||
                    Token.set_struct_attr(token, attr_ids[j], array[i, j])
 | 
					                    value = values[j * stride + i]
 | 
				
			||||||
 | 
					                    Token.set_struct_attr(token, attr_ids[j], value)
 | 
				
			||||||
        # Set flags
 | 
					        # Set flags
 | 
				
			||||||
        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
 | 
					        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
 | 
				
			||||||
        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
 | 
					        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
 | 
				
			||||||
| 
						 | 
					@ -849,6 +894,28 @@ cdef class Doc:
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
 | 
					        return numpy.asarray(_get_lca_matrix(self, 0, len(self)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def copy(self):
 | 
				
			||||||
 | 
					        cdef Doc other = Doc(self.vocab)
 | 
				
			||||||
 | 
					        other._vector = copy.deepcopy(self._vector)
 | 
				
			||||||
 | 
					        other._vector_norm = copy.deepcopy(self._vector_norm)
 | 
				
			||||||
 | 
					        other.tensor = copy.deepcopy(self.tensor)
 | 
				
			||||||
 | 
					        other.cats = copy.deepcopy(self.cats)
 | 
				
			||||||
 | 
					        other.user_data = copy.deepcopy(self.user_data)
 | 
				
			||||||
 | 
					        other.is_tagged = self.is_tagged
 | 
				
			||||||
 | 
					        other.is_parsed = self.is_parsed
 | 
				
			||||||
 | 
					        other.is_morphed = self.is_morphed
 | 
				
			||||||
 | 
					        other.sentiment = self.sentiment
 | 
				
			||||||
 | 
					        other.user_hooks = dict(self.user_hooks)
 | 
				
			||||||
 | 
					        other.user_token_hooks = dict(self.user_token_hooks)
 | 
				
			||||||
 | 
					        other.user_span_hooks = dict(self.user_span_hooks)
 | 
				
			||||||
 | 
					        other.length = self.length
 | 
				
			||||||
 | 
					        other.max_length = self.max_length
 | 
				
			||||||
 | 
					        buff_size = other.max_length + (PADDING*2)
 | 
				
			||||||
 | 
					        tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
 | 
				
			||||||
 | 
					        memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
 | 
				
			||||||
 | 
					        other.c = &tokens[PADDING]
 | 
				
			||||||
 | 
					        return other
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def to_disk(self, path, **kwargs):
 | 
					    def to_disk(self, path, **kwargs):
 | 
				
			||||||
        """Save the current state to a directory.
 | 
					        """Save the current state to a directory.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -881,6 +948,32 @@ cdef class Doc:
 | 
				
			||||||
    def to_bytes(self, exclude=tuple(), **kwargs):
 | 
					    def to_bytes(self, exclude=tuple(), **kwargs):
 | 
				
			||||||
        """Serialize, i.e. export the document contents to a binary string.
 | 
					        """Serialize, i.e. export the document contents to a binary string.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        exclude (list): String names of serialization fields to exclude.
 | 
				
			||||||
 | 
					        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
 | 
				
			||||||
 | 
					            all annotations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        DOCS: https://spacy.io/api/doc#to_bytes
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
 | 
				
			||||||
 | 
					        """Deserialize, i.e. import the document contents from a binary string.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        data (bytes): The string to load from.
 | 
				
			||||||
 | 
					        exclude (list): String names of serialization fields to exclude.
 | 
				
			||||||
 | 
					        RETURNS (Doc): Itself.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        DOCS: https://spacy.io/api/doc#from_bytes
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        return self.from_dict(
 | 
				
			||||||
 | 
					            srsly.msgpack_loads(bytes_data),
 | 
				
			||||||
 | 
					            exclude=exclude,
 | 
				
			||||||
 | 
					            **kwargs
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def to_dict(self, exclude=tuple(), **kwargs):
 | 
				
			||||||
 | 
					        """Export the document contents to a dictionary for serialization.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        exclude (list): String names of serialization fields to exclude.
 | 
					        exclude (list): String names of serialization fields to exclude.
 | 
				
			||||||
        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
 | 
					        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
 | 
				
			||||||
            all annotations.
 | 
					            all annotations.
 | 
				
			||||||
| 
						 | 
					@ -917,9 +1010,9 @@ cdef class Doc:
 | 
				
			||||||
                serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
 | 
					                serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
 | 
				
			||||||
            if "user_data_values" not in exclude:
 | 
					            if "user_data_values" not in exclude:
 | 
				
			||||||
                serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
 | 
					                serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
 | 
				
			||||||
        return util.to_bytes(serializers, exclude)
 | 
					        return util.to_dict(serializers, exclude)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
 | 
					    def from_dict(self, msg, exclude=tuple(), **kwargs):
 | 
				
			||||||
        """Deserialize, i.e. import the document contents from a binary string.
 | 
					        """Deserialize, i.e. import the document contents from a binary string.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        data (bytes): The string to load from.
 | 
					        data (bytes): The string to load from.
 | 
				
			||||||
| 
						 | 
					@ -943,7 +1036,6 @@ cdef class Doc:
 | 
				
			||||||
        for key in kwargs:
 | 
					        for key in kwargs:
 | 
				
			||||||
            if key in deserializers or key in ("user_data",):
 | 
					            if key in deserializers or key in ("user_data",):
 | 
				
			||||||
                raise ValueError(Errors.E128.format(arg=key))
 | 
					                raise ValueError(Errors.E128.format(arg=key))
 | 
				
			||||||
        msg = util.from_bytes(bytes_data, deserializers, exclude)
 | 
					 | 
				
			||||||
        # Msgpack doesn't distinguish between lists and tuples, which is
 | 
					        # Msgpack doesn't distinguish between lists and tuples, which is
 | 
				
			||||||
        # vexing for user data. As a best guess, we *know* that within
 | 
					        # vexing for user data. As a best guess, we *know* that within
 | 
				
			||||||
        # keys, we must have tuples. In values we just have to hope
 | 
					        # keys, we must have tuples. In values we just have to hope
 | 
				
			||||||
| 
						 | 
					@ -975,6 +1067,7 @@ cdef class Doc:
 | 
				
			||||||
        self.from_array(msg["array_head"][2:], attrs[:, 2:])
 | 
					        self.from_array(msg["array_head"][2:], attrs[:, 2:])
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def extend_tensor(self, tensor):
 | 
					    def extend_tensor(self, tensor):
 | 
				
			||||||
        """Concatenate a new tensor onto the doc.tensor object.
 | 
					        """Concatenate a new tensor onto the doc.tensor object.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -778,6 +778,10 @@ cdef class Token:
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        return self.c.ent_iob
 | 
					        return self.c.ent_iob
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def iob_strings(cls):
 | 
				
			||||||
 | 
					        return ("", "I", "O", "B")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def ent_iob_(self):
 | 
					    def ent_iob_(self):
 | 
				
			||||||
        """IOB code of named entity tag. "B" means the token begins an entity,
 | 
					        """IOB code of named entity tag. "B" means the token begins an entity,
 | 
				
			||||||
| 
						 | 
					@ -787,8 +791,7 @@ cdef class Token:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        RETURNS (str): IOB code of named entity tag.
 | 
					        RETURNS (str): IOB code of named entity tag.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        iob_strings = ("", "I", "O", "B")
 | 
					        return self.iob_strings()[self.c.ent_iob]
 | 
				
			||||||
        return iob_strings[self.c.ent_iob]
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    property ent_id:
 | 
					    property ent_id:
 | 
				
			||||||
        """RETURNS (uint64): ID of the entity the token is an instance of,
 | 
					        """RETURNS (uint64): ID of the entity the token is an instance of,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -508,14 +508,6 @@ def get_async(stream, numpy_array):
 | 
				
			||||||
        return array
 | 
					        return array
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def eg2doc(example):
 | 
					 | 
				
			||||||
    """Get a Doc object from an Example (or if it's a Doc, use it directly)"""
 | 
					 | 
				
			||||||
    # Put the import here to avoid circular import problems
 | 
					 | 
				
			||||||
    from .tokens.doc import Doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return example if isinstance(example, Doc) else example.doc
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def env_opt(name, default=None):
 | 
					def env_opt(name, default=None):
 | 
				
			||||||
    if type(default) is float:
 | 
					    if type(default) is float:
 | 
				
			||||||
        type_convert = float
 | 
					        type_convert = float
 | 
				
			||||||
| 
						 | 
					@ -734,12 +726,13 @@ def decaying(start, stop, decay):
 | 
				
			||||||
        curr -= decay
 | 
					        curr -= decay
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def minibatch_by_words(
 | 
					def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
 | 
				
			||||||
    examples, size, count_words=len, tolerance=0.2, discard_oversize=False
 | 
					 | 
				
			||||||
):
 | 
					 | 
				
			||||||
    """Create minibatches of roughly a given number of words. If any examples
 | 
					    """Create minibatches of roughly a given number of words. If any examples
 | 
				
			||||||
    are longer than the specified batch length, they will appear in a batch by
 | 
					    are longer than the specified batch length, they will appear in a batch by
 | 
				
			||||||
    themselves, or be discarded if discard_oversize=True."""
 | 
					    themselves, or be discarded if discard_oversize=True.
 | 
				
			||||||
 | 
					    The argument 'docs' can be a list of strings, Doc's or Example's. """
 | 
				
			||||||
 | 
					    from .gold import Example
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if isinstance(size, int):
 | 
					    if isinstance(size, int):
 | 
				
			||||||
        size_ = itertools.repeat(size)
 | 
					        size_ = itertools.repeat(size)
 | 
				
			||||||
    elif isinstance(size, List):
 | 
					    elif isinstance(size, List):
 | 
				
			||||||
| 
						 | 
					@ -754,22 +747,27 @@ def minibatch_by_words(
 | 
				
			||||||
    batch_size = 0
 | 
					    batch_size = 0
 | 
				
			||||||
    overflow_size = 0
 | 
					    overflow_size = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for example in examples:
 | 
					    for doc in docs:
 | 
				
			||||||
        n_words = count_words(example.doc)
 | 
					        if isinstance(doc, Example):
 | 
				
			||||||
 | 
					            n_words = len(doc.reference)
 | 
				
			||||||
 | 
					        elif isinstance(doc, str):
 | 
				
			||||||
 | 
					            n_words = len(doc.split())
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            n_words = len(doc)
 | 
				
			||||||
        # if the current example exceeds the maximum batch size, it is returned separately
 | 
					        # if the current example exceeds the maximum batch size, it is returned separately
 | 
				
			||||||
        # but only if discard_oversize=False.
 | 
					        # but only if discard_oversize=False.
 | 
				
			||||||
        if n_words > target_size + tol_size:
 | 
					        if n_words > target_size + tol_size:
 | 
				
			||||||
            if not discard_oversize:
 | 
					            if not discard_oversize:
 | 
				
			||||||
                yield [example]
 | 
					                yield [doc]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # add the example to the current batch if there's no overflow yet and it still fits
 | 
					        # add the example to the current batch if there's no overflow yet and it still fits
 | 
				
			||||||
        elif overflow_size == 0 and (batch_size + n_words) <= target_size:
 | 
					        elif overflow_size == 0 and (batch_size + n_words) <= target_size:
 | 
				
			||||||
            batch.append(example)
 | 
					            batch.append(doc)
 | 
				
			||||||
            batch_size += n_words
 | 
					            batch_size += n_words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # add the example to the overflow buffer if it fits in the tolerance margin
 | 
					        # add the example to the overflow buffer if it fits in the tolerance margin
 | 
				
			||||||
        elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
 | 
					        elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
 | 
				
			||||||
            overflow.append(example)
 | 
					            overflow.append(doc)
 | 
				
			||||||
            overflow_size += n_words
 | 
					            overflow_size += n_words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # yield the previous batch and start a new one. The new one gets the overflow examples.
 | 
					        # yield the previous batch and start a new one. The new one gets the overflow examples.
 | 
				
			||||||
| 
						 | 
					@ -784,12 +782,12 @@ def minibatch_by_words(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # this example still fits
 | 
					            # this example still fits
 | 
				
			||||||
            if (batch_size + n_words) <= target_size:
 | 
					            if (batch_size + n_words) <= target_size:
 | 
				
			||||||
                batch.append(example)
 | 
					                batch.append(doc)
 | 
				
			||||||
                batch_size += n_words
 | 
					                batch_size += n_words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # this example fits in overflow
 | 
					            # this example fits in overflow
 | 
				
			||||||
            elif (batch_size + n_words) <= (target_size + tol_size):
 | 
					            elif (batch_size + n_words) <= (target_size + tol_size):
 | 
				
			||||||
                overflow.append(example)
 | 
					                overflow.append(doc)
 | 
				
			||||||
                overflow_size += n_words
 | 
					                overflow_size += n_words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # this example does not fit with the previous overflow: start another new batch
 | 
					            # this example does not fit with the previous overflow: start another new batch
 | 
				
			||||||
| 
						 | 
					@ -797,7 +795,7 @@ def minibatch_by_words(
 | 
				
			||||||
                yield batch
 | 
					                yield batch
 | 
				
			||||||
                target_size = next(size_)
 | 
					                target_size = next(size_)
 | 
				
			||||||
                tol_size = target_size * tolerance
 | 
					                tol_size = target_size * tolerance
 | 
				
			||||||
                batch = [example]
 | 
					                batch = [doc]
 | 
				
			||||||
                batch_size = n_words
 | 
					                batch_size = n_words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # yield the final batch
 | 
					    # yield the final batch
 | 
				
			||||||
| 
						 | 
					@ -858,16 +856,23 @@ def filter_spans(spans):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def to_bytes(getters, exclude):
 | 
					def to_bytes(getters, exclude):
 | 
				
			||||||
 | 
					    return srsly.msgpack_dumps(to_dict(getters, exclude))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def from_bytes(bytes_data, setters, exclude):
 | 
				
			||||||
 | 
					    return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def to_dict(getters, exclude):
 | 
				
			||||||
    serialized = {}
 | 
					    serialized = {}
 | 
				
			||||||
    for key, getter in getters.items():
 | 
					    for key, getter in getters.items():
 | 
				
			||||||
        # Split to support file names like meta.json
 | 
					        # Split to support file names like meta.json
 | 
				
			||||||
        if key.split(".")[0] not in exclude:
 | 
					        if key.split(".")[0] not in exclude:
 | 
				
			||||||
            serialized[key] = getter()
 | 
					            serialized[key] = getter()
 | 
				
			||||||
    return srsly.msgpack_dumps(serialized)
 | 
					    return serialized
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def from_bytes(bytes_data, setters, exclude):
 | 
					def from_dict(msg, setters, exclude):
 | 
				
			||||||
    msg = srsly.msgpack_loads(bytes_data)
 | 
					 | 
				
			||||||
    for key, setter in setters.items():
 | 
					    for key, setter in setters.items():
 | 
				
			||||||
        # Split to support file names like meta.json
 | 
					        # Split to support file names like meta.json
 | 
				
			||||||
        if key.split(".")[0] not in exclude and key in msg:
 | 
					        if key.split(".")[0] not in exclude and key in msg:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user