mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'master' of github.com:honnibal/spaCy
This commit is contained in:
commit
e7ec06cea2
|
@ -5,7 +5,7 @@ from __future__ import unicode_literals
|
|||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import codecs
|
||||
import io
|
||||
import random
|
||||
import time
|
||||
import gzip
|
||||
|
@ -56,12 +56,20 @@ def _parse_line(line):
|
|||
if len(pieces) == 4:
|
||||
word, pos, head_idx, label = pieces
|
||||
head_idx = int(head_idx)
|
||||
elif len(pieces) == 15:
|
||||
id_ = int(pieces[0].split('_')[-1])
|
||||
word = pieces[1]
|
||||
pos = pieces[4]
|
||||
head_idx = int(pieces[8])-1
|
||||
label = pieces[10]
|
||||
else:
|
||||
id_ = int(pieces[0])
|
||||
id_ = int(pieces[0].split('_')[-1])
|
||||
word = pieces[1]
|
||||
pos = pieces[4]
|
||||
head_idx = int(pieces[6])-1
|
||||
label = pieces[7]
|
||||
if head_idx == 0:
|
||||
label = 'ROOT'
|
||||
return word, pos, head_idx, label
|
||||
|
||||
|
||||
|
@ -69,8 +77,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
|
|||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.parser(tokens)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=verbose)
|
||||
gold = GoldParse(tokens, annot_tuples, make_projective=False)
|
||||
scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
|
||||
|
||||
|
||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||
|
@ -121,12 +129,20 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
|
|||
print('done')
|
||||
|
||||
|
||||
def main(train_loc, dev_loc, model_dir):
|
||||
with codecs.open(train_loc, 'r', 'utf8') as file_:
|
||||
@plac.annotations(
|
||||
train_loc=("Location of CoNLL 09 formatted training file"),
|
||||
dev_loc=("Location of CoNLL 09 formatted development file"),
|
||||
model_dir=("Location of output model directory"),
|
||||
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
|
||||
n_iter=("Number of training iterations", "option", "i", int),
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, n_iter=15):
|
||||
with io.open(train_loc, 'r', encoding='utf8') as file_:
|
||||
train_sents = read_conll(file_)
|
||||
train(English, train_sents, model_dir)
|
||||
if not eval_only:
|
||||
train(English, train_sents, model_dir, n_iter=n_iter)
|
||||
nlp = English(data_dir=model_dir)
|
||||
dev_sents = read_conll(open(dev_loc))
|
||||
dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
|
||||
scorer = Scorer()
|
||||
for _, sents in dev_sents:
|
||||
for annot_tuples, _ in sents:
|
||||
|
|
|
@ -62,8 +62,10 @@ def main(output_dir):
|
|||
tokens = tokenizer.tokens_from_list(words)
|
||||
tagger.train(tokens, tags)
|
||||
random.shuffle(DATA)
|
||||
tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
|
||||
vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
|
||||
tagger.model.end_training()
|
||||
tagger.model.dump(path.join(output_dir, 'pos', 'model'))
|
||||
with io.open(output_dir, 'vocab', 'strings.json') as file_:
|
||||
tagger.vocab.strings.dump(file_)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
{}
|
|
@ -18,8 +18,10 @@
|
|||
"KOUI": {"pos": "SCONJ"},
|
||||
"KOUS": {"pos": "SCONJ"},
|
||||
"NE": {"pos": "PROPN"},
|
||||
"NNE": {"pos": "PROPN"},
|
||||
"NN": {"pos": "NOUN"},
|
||||
"PAV": {"pos": "ADV", "PronType": "Dem"},
|
||||
"PROAV": {"pos": "ADV", "PronType": "Dem"},
|
||||
"PDAT": {"pos": "DET", "PronType": "Dem"},
|
||||
"PDS": {"pos": "PRON", "PronType": "Dem"},
|
||||
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
|
||||
|
@ -52,5 +54,6 @@
|
|||
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
|
||||
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
|
||||
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
|
||||
"XY": {"pos": "X"}
|
||||
"XY": {"pos": "X"},
|
||||
"SP": {"pos": "SPACE"}
|
||||
}
|
||||
|
|
|
@ -272,22 +272,43 @@ class Language(object):
|
|||
def end_training(self, data_dir=None):
|
||||
if data_dir is None:
|
||||
data_dir = self.data_dir
|
||||
self.parser.model.end_training()
|
||||
self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
|
||||
self.entity.model.end_training()
|
||||
self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
|
||||
self.tagger.model.end_training()
|
||||
self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
|
||||
if self.parser:
|
||||
self.parser.model.end_training()
|
||||
self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
|
||||
if self.entity:
|
||||
self.entity.model.end_training()
|
||||
self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
|
||||
if self.tagger:
|
||||
self.tagger.model.end_training()
|
||||
self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
|
||||
|
||||
strings_loc = path.join(data_dir, 'vocab', 'strings.json')
|
||||
with io.open(strings_loc, 'w', encoding='utf8') as file_:
|
||||
self.vocab.strings.dump(file_)
|
||||
self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
|
||||
|
||||
if self.tagger:
|
||||
tagger_freqs = list(self.tagger.freqs[TAG].items())
|
||||
else:
|
||||
tagger_freqs = []
|
||||
if self.parser:
|
||||
dep_freqs = list(self.parser.moves.freqs[DEP].items())
|
||||
head_freqs = list(self.parser.moves.freqs[HEAD].items())
|
||||
else:
|
||||
dep_freqs = []
|
||||
head_freqs = []
|
||||
if self.entity:
|
||||
entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
|
||||
entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
|
||||
else:
|
||||
entity_iob_freqs = []
|
||||
entity_type_freqs = []
|
||||
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
|
||||
file_.write(
|
||||
json.dumps([
|
||||
(TAG, list(self.tagger.freqs[TAG].items())),
|
||||
(DEP, list(self.parser.moves.freqs[DEP].items())),
|
||||
(ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
|
||||
(ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
|
||||
(HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
|
||||
(TAG, tagger_freqs),
|
||||
(DEP, dep_freqs),
|
||||
(ENT_IOB, entity_iob_freqs),
|
||||
(ENT_TYPE, entity_type_freqs),
|
||||
(HEAD, head_freqs)
|
||||
]))
|
||||
|
|
|
@ -70,7 +70,7 @@ class Scorer(object):
|
|||
def ents_f(self):
|
||||
return self.ner.fscore * 100
|
||||
|
||||
def score(self, tokens, gold, verbose=False):
|
||||
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
||||
assert len(tokens) == len(gold)
|
||||
|
||||
gold_deps = set()
|
||||
|
@ -78,7 +78,7 @@ class Scorer(object):
|
|||
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
|
||||
for id_, word, tag, head, dep, ner in gold.orig_annot:
|
||||
gold_tags.add((id_, tag))
|
||||
if dep.lower() not in ('p', 'punct'):
|
||||
if dep.lower() not in punct_labels:
|
||||
gold_deps.add((id_, head, dep.lower()))
|
||||
cand_deps = set()
|
||||
cand_tags = set()
|
||||
|
@ -87,12 +87,12 @@ class Scorer(object):
|
|||
continue
|
||||
gold_i = gold.cand_to_gold[token.i]
|
||||
if gold_i is None:
|
||||
if token.dep_.lower() not in ('p', 'punct'):
|
||||
if token.dep_.lower() not in punct_labels:
|
||||
self.tokens.fp += 1
|
||||
else:
|
||||
self.tokens.tp += 1
|
||||
cand_tags.add((gold_i, token.tag_))
|
||||
if token.dep_.lower() not in ('p', 'punct') and token.orth_.strip():
|
||||
if token.dep_.lower() not in punct_labels and token.orth_.strip():
|
||||
gold_head = gold.cand_to_gold[token.head.i]
|
||||
# None is indistinct, so we can't just add it to the set
|
||||
# Multiple (None, None) deps are possible
|
||||
|
|
|
@ -216,6 +216,11 @@ cdef class Tagger:
|
|||
|
||||
def train(self, Doc tokens, object gold_tag_strs):
|
||||
assert len(tokens) == len(gold_tag_strs)
|
||||
for tag in gold_tag_strs:
|
||||
if tag not in self.tag_names:
|
||||
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all"
|
||||
"gold tags, to maintain coarse-grained mapping.")
|
||||
raise ValueError(msg % tag)
|
||||
golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
|
||||
cdef int correct = 0
|
||||
cdef Pool mem = Pool()
|
||||
|
|
|
@ -145,6 +145,9 @@ mixin LexemeDistributional
|
|||
+Define("vector")
|
||||
| A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
|
||||
|
||||
+Define("has_vector")
|
||||
| A boolean value indicating whether a vector is available for the word.
|
||||
|
||||
|
||||
mixin Func(type1, type2)
|
||||
#{"λ " + type1 + ", " + type2}
|
||||
|
@ -373,6 +376,17 @@ mixin Func(type1, type2)
|
|||
+Define("whitespace_")
|
||||
| The number of immediate syntactic children following the word in the string.
|
||||
|
||||
details(open=true)
|
||||
summary: h4 Part-of-Speech Tags
|
||||
|
||||
ul
|
||||
+Define("pos / pos_")
|
||||
| A coarse-grained, less detailed tag that represents the word-class of the token. The set of #[code .pos] tags is consistent across languages. The available tags are ADJ, ADP, ADV, AUX, CONJ, DET, INTJ, NOUN, NUM, PART, PRON, PROPN, PUNCT, SCONJ, SYM, VERB, X, EOL, SPACE.
|
||||
|
||||
ul
|
||||
+Define("tag / tag_")
|
||||
| A fine-grained, more detailed tag that represents the word-class and some basic morphological information for the token. These tags are primarily designed to be good features for subsequent models, particularly the syntactic parser. They are language and treebank dependent. The tagger is trained to predict these fine-grained tags, and then a mapping table is used to reduce them to the coarse-grained #[code .pos] tags.
|
||||
|
||||
details(open=true)
|
||||
summary: h4 Navigating the Parse Tree
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user