mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
80a5146ec2
|
@ -229,7 +229,7 @@ Compile from source
|
||||||
The other way to install spaCy is to clone its
|
The other way to install spaCy is to clone its
|
||||||
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
|
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
|
||||||
source. That is the common way if you want to make changes to the code base.
|
source. That is the common way if you want to make changes to the code base.
|
||||||
You'll need to make sure that you have a development enviroment consisting of a
|
You'll need to make sure that you have a development environment consisting of a
|
||||||
Python distribution including header files, a compiler,
|
Python distribution including header files, a compiler,
|
||||||
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
|
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
|
||||||
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
|
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
|
||||||
|
|
|
@ -3,15 +3,21 @@ from __future__ import print_function
|
||||||
# NB! This breaks in plac on Python 2!!
|
# NB! This breaks in plac on Python 2!!
|
||||||
#from __future__ import unicode_literals
|
#from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import plac
|
import plac
|
||||||
import sys
|
import sys
|
||||||
from spacy.cli import download, link, info, package, train, convert
|
from spacy.cli import download, link, info, package, train, convert, model
|
||||||
from spacy.util import prints
|
from spacy.util import prints
|
||||||
|
|
||||||
commands = {'download': download, 'link': link, 'info': info, 'train': train,
|
commands = {
|
||||||
'convert': convert, 'package': package}
|
'download': download,
|
||||||
|
'link': link,
|
||||||
|
'info': info,
|
||||||
|
'train': train,
|
||||||
|
'convert': convert,
|
||||||
|
'package': package,
|
||||||
|
'model': model
|
||||||
|
}
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
prints(', '.join(commands), title="Available commands", exits=1)
|
prints(', '.join(commands), title="Available commands", exits=1)
|
||||||
command = sys.argv.pop(1)
|
command = sys.argv.pop(1)
|
||||||
|
@ -19,5 +25,7 @@ if __name__ == '__main__':
|
||||||
if command in commands:
|
if command in commands:
|
||||||
plac.call(commands[command])
|
plac.call(commands[command])
|
||||||
else:
|
else:
|
||||||
prints("Available: %s" % ', '.join(commands),
|
prints(
|
||||||
title="Unknown command: %s" % command, exits=1)
|
"Available: %s" % ', '.join(commands),
|
||||||
|
title="Unknown command: %s" % command,
|
||||||
|
exits=1)
|
||||||
|
|
|
@ -4,3 +4,4 @@ from .link import link
|
||||||
from .package import package
|
from .package import package
|
||||||
from .train import train
|
from .train import train
|
||||||
from .convert import convert
|
from .convert import convert
|
||||||
|
from .model import model
|
||||||
|
|
119
spacy/cli/model.py
Normal file
119
spacy/cli/model.py
Normal file
|
@ -0,0 +1,119 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import math
|
||||||
|
from ast import literal_eval
|
||||||
|
from pathlib import Path
|
||||||
|
from preshed.counter import PreshCounter
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
from ..compat import fix_text
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
|
||||||
|
model_path = Path(model_dir)
|
||||||
|
freqs_path = Path(freqs_data)
|
||||||
|
clusters_path = Path(clusters_data) if clusters_data else None
|
||||||
|
vectors_path = Path(vectors_data) if vectors_data else None
|
||||||
|
|
||||||
|
check_dirs(freqs_path, clusters_path, vectors_path)
|
||||||
|
# vocab = util.get_lang_class(lang).Defaults.create_vocab()
|
||||||
|
nlp = spacy.blank(lang)
|
||||||
|
vocab = nlp.vocab
|
||||||
|
probs, oov_prob = read_probs(freqs_path)
|
||||||
|
clusters = read_clusters(clusters_path) if clusters_path else {}
|
||||||
|
populate_vocab(vocab, clusters, probs, oov_prob)
|
||||||
|
create_model(model_path, nlp)
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(model_path, model):
|
||||||
|
if not model_path.exists():
|
||||||
|
model_path.mkdir()
|
||||||
|
model.to_disk(model_path.as_posix())
|
||||||
|
|
||||||
|
|
||||||
|
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
|
||||||
|
counts = PreshCounter()
|
||||||
|
total = 0
|
||||||
|
freqs_file = check_unzip(freqs_path)
|
||||||
|
for i, line in enumerate(freqs_file):
|
||||||
|
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||||
|
freq = int(freq)
|
||||||
|
counts.inc(i + 1, freq)
|
||||||
|
total += freq
|
||||||
|
counts.smooth()
|
||||||
|
log_total = math.log(total)
|
||||||
|
freqs_file = check_unzip(freqs_path)
|
||||||
|
probs = {}
|
||||||
|
for line in freqs_file:
|
||||||
|
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||||
|
doc_freq = int(doc_freq)
|
||||||
|
freq = int(freq)
|
||||||
|
if doc_freq >= min_doc_freq and freq >= min_freq and len(
|
||||||
|
key) < max_length:
|
||||||
|
word = literal_eval(key)
|
||||||
|
smooth_count = counts.smoother(int(freq))
|
||||||
|
probs[word] = math.log(smooth_count) - log_total
|
||||||
|
oov_prob = math.log(counts.smoother(0)) - log_total
|
||||||
|
return probs, oov_prob
|
||||||
|
|
||||||
|
|
||||||
|
def read_clusters(clusters_path):
|
||||||
|
clusters = {}
|
||||||
|
with clusters_path.open() as f:
|
||||||
|
for line in f:
|
||||||
|
try:
|
||||||
|
cluster, word, freq = line.split()
|
||||||
|
word = fix_text(word)
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
# If the clusterer has only seen the word a few times, its
|
||||||
|
# cluster is unreliable.
|
||||||
|
if int(freq) >= 3:
|
||||||
|
clusters[word] = cluster
|
||||||
|
else:
|
||||||
|
clusters[word] = '0'
|
||||||
|
# Expand clusters with re-casing
|
||||||
|
for word, cluster in list(clusters.items()):
|
||||||
|
if word.lower() not in clusters:
|
||||||
|
clusters[word.lower()] = cluster
|
||||||
|
if word.title() not in clusters:
|
||||||
|
clusters[word.title()] = cluster
|
||||||
|
if word.upper() not in clusters:
|
||||||
|
clusters[word.upper()] = cluster
|
||||||
|
return clusters
|
||||||
|
|
||||||
|
|
||||||
|
def populate_vocab(vocab, clusters, probs, oov_prob):
|
||||||
|
for word, prob in reversed(
|
||||||
|
sorted(list(probs.items()), key=lambda item: item[1])):
|
||||||
|
lexeme = vocab[word]
|
||||||
|
lexeme.prob = prob
|
||||||
|
lexeme.is_oov = False
|
||||||
|
# Decode as a little-endian string, so that we can do & 15 to get
|
||||||
|
# the first 4 bits. See _parse_features.pyx
|
||||||
|
if word in clusters:
|
||||||
|
lexeme.cluster = int(clusters[word][::-1], 2)
|
||||||
|
else:
|
||||||
|
lexeme.cluster = 0
|
||||||
|
|
||||||
|
|
||||||
|
def check_unzip(file_path):
|
||||||
|
file_path_str = file_path.as_posix()
|
||||||
|
if file_path_str.endswith('gz'):
|
||||||
|
return gzip.open(file_path_str)
|
||||||
|
else:
|
||||||
|
return file_path.open()
|
||||||
|
|
||||||
|
|
||||||
|
def check_dirs(freqs_data, clusters_data, vectors_data):
|
||||||
|
if not freqs_data.is_file():
|
||||||
|
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
|
||||||
|
if clusters_data and not clusters_data.is_file():
|
||||||
|
util.sys_exit(
|
||||||
|
clusters_data.as_posix(), title="No Brown clusters file found")
|
||||||
|
if vectors_data and not vectors_data.is_file():
|
||||||
|
util.sys_exit(
|
||||||
|
vectors_data.as_posix(), title="No word vectors file found")
|
|
@ -406,11 +406,11 @@ cdef class GoldParse:
|
||||||
if tags is None:
|
if tags is None:
|
||||||
tags = [None for _ in doc]
|
tags = [None for _ in doc]
|
||||||
if heads is None:
|
if heads is None:
|
||||||
heads = [token.i for token in doc]
|
heads = [None for token in doc]
|
||||||
if deps is None:
|
if deps is None:
|
||||||
deps = [None for _ in doc]
|
deps = [None for _ in doc]
|
||||||
if entities is None:
|
if entities is None:
|
||||||
entities = ['-' for _ in doc]
|
entities = [None for _ in doc]
|
||||||
elif len(entities) == 0:
|
elif len(entities) == 0:
|
||||||
entities = ['O' for _ in doc]
|
entities = ['O' for _ in doc]
|
||||||
elif not isinstance(entities[0], basestring):
|
elif not isinstance(entities[0], basestring):
|
||||||
|
|
|
@ -200,6 +200,7 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
flat_list.append(pipe)
|
flat_list.append(pipe)
|
||||||
self.pipeline = flat_list
|
self.pipeline = flat_list
|
||||||
|
self._optimizer = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def meta(self):
|
def meta(self):
|
||||||
|
@ -278,7 +279,7 @@ class Language(object):
|
||||||
return self.tokenizer(text)
|
return self.tokenizer(text)
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
||||||
update_tensors=False):
|
update_shared=False):
|
||||||
"""Update the models in the pipeline.
|
"""Update the models in the pipeline.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -298,6 +299,10 @@ class Language(object):
|
||||||
"Got: %d, %d" % (len(docs), len(golds)))
|
"Got: %d, %d" % (len(docs), len(golds)))
|
||||||
if len(docs) == 0:
|
if len(docs) == 0:
|
||||||
return
|
return
|
||||||
|
if sgd is None:
|
||||||
|
if self._optimizer is None:
|
||||||
|
self._optimizer = Adam(Model.ops, 0.001)
|
||||||
|
sgd = self._optimizer
|
||||||
tok2vec = self.pipeline[0]
|
tok2vec = self.pipeline[0]
|
||||||
feats = tok2vec.doc2feats(docs)
|
feats = tok2vec.doc2feats(docs)
|
||||||
grads = {}
|
grads = {}
|
||||||
|
@ -312,12 +317,13 @@ class Language(object):
|
||||||
continue
|
continue
|
||||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||||
drop=drop, sgd=get_grads, losses=losses)
|
drop=drop, sgd=get_grads, losses=losses)
|
||||||
if update_tensors and d_tokvecses is not None:
|
if update_shared and d_tokvecses is not None:
|
||||||
for i, d_tv in enumerate(d_tokvecses):
|
for i, d_tv in enumerate(d_tokvecses):
|
||||||
all_d_tokvecses[i] += d_tv
|
all_d_tokvecses[i] += d_tv
|
||||||
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
if update_shared and bp_tokvecses is not None:
|
||||||
for key, (W, dW) in grads.items():
|
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
||||||
sgd(W, dW, key=key)
|
for key, (W, dW) in grads.items():
|
||||||
|
sgd(W, dW, key=key)
|
||||||
# Clear the tensor variable, to free GPU memory.
|
# Clear the tensor variable, to free GPU memory.
|
||||||
# If we don't do this, the memory leak gets pretty
|
# If we don't do this, the memory leak gets pretty
|
||||||
# bad, because we may be holding part of a batch.
|
# bad, because we may be holding part of a batch.
|
||||||
|
@ -378,11 +384,11 @@ class Language(object):
|
||||||
eps = util.env_opt('optimizer_eps', 1e-08)
|
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||||
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||||
beta2=beta2, eps=eps)
|
beta2=beta2, eps=eps)
|
||||||
optimizer.max_grad_norm = max_grad_norm
|
self._optimizer.max_grad_norm = max_grad_norm
|
||||||
optimizer.device = device
|
self._optimizer.device = device
|
||||||
return optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def evaluate(self, docs_golds):
|
def evaluate(self, docs_golds):
|
||||||
scorer = Scorer()
|
scorer = Scorer()
|
||||||
|
|
|
@ -294,6 +294,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
|
|
||||||
if self.model.nI is None:
|
if self.model.nI is None:
|
||||||
|
@ -302,6 +304,8 @@ class NeuralTagger(BaseThincComponent):
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += loss
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
|
|
@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
def has_gold(self, GoldParse gold, start=0, end=None):
|
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||||
end = end or len(gold.ner)
|
end = end or len(gold.ner)
|
||||||
if all([tag == '-' for tag in gold.ner[start:end]]):
|
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -483,6 +483,9 @@ cdef class Parser:
|
||||||
return beams
|
return beams
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds)
|
||||||
|
if not golds:
|
||||||
|
return None
|
||||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||||
return self.update_beam(docs_tokvecs, golds,
|
return self.update_beam(docs_tokvecs, golds,
|
||||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||||
|
@ -555,6 +558,9 @@ cdef class Parser:
|
||||||
|
|
||||||
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||||
drop=0., sgd=None, losses=None):
|
drop=0., sgd=None, losses=None):
|
||||||
|
docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds)
|
||||||
|
if not golds:
|
||||||
|
return None
|
||||||
if width is None:
|
if width is None:
|
||||||
width = self.cfg.get('beam_width', 2)
|
width = self.cfg.get('beam_width', 2)
|
||||||
if density is None:
|
if density is None:
|
||||||
|
@ -605,6 +611,15 @@ cdef class Parser:
|
||||||
bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
|
def _filter_unlabelled(self, docs_tokvecs, golds):
|
||||||
|
'''Remove inputs that have no relevant labels before update'''
|
||||||
|
has_golds = [self.moves.has_gold(gold) for gold in golds]
|
||||||
|
docs, tokvecs = docs_tokvecs
|
||||||
|
docs = [docs[i] for i, has_gold in enumerate(has_golds) if has_gold]
|
||||||
|
tokvecs = [tokvecs[i] for i, has_gold in enumerate(has_golds) if has_gold]
|
||||||
|
golds = [golds[i] for i, has_gold in enumerate(has_golds) if has_gold]
|
||||||
|
return (docs, tokvecs), golds
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||||
|
|
|
@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
|
||||||
|
|
||||||
p
|
p
|
||||||
| The token within the span that's highest in the parse tree. If there's a
|
| The token within the span that's highest in the parse tree. If there's a
|
||||||
| tie, the earlist is prefered.
|
| tie, the earliest is preferred.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
doc = nlp(u'I like New York in Autumn.')
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
|
|
@ -39,7 +39,7 @@ p
|
||||||
+h(2, "special-cases") Adding special case tokenization rules
|
+h(2, "special-cases") Adding special case tokenization rules
|
||||||
|
|
||||||
p
|
p
|
||||||
| Most domains have at least some idiosyncracies that require custom
|
| Most domains have at least some idiosyncrasies that require custom
|
||||||
| tokenization rules. This could be very certain expressions, or
|
| tokenization rules. This could be very certain expressions, or
|
||||||
| abbreviations only used in this specific field.
|
| abbreviations only used in this specific field.
|
||||||
|
|
||||||
|
|
|
@ -109,7 +109,7 @@ p
|
||||||
| The other way to install spaCy is to clone its
|
| The other way to install spaCy is to clone its
|
||||||
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
|
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
|
||||||
| the common way if you want to make changes to the code base. You'll need to
|
| the common way if you want to make changes to the code base. You'll need to
|
||||||
| make sure that you have a development enviroment consisting of a Python
|
| make sure that you have a development environment consisting of a Python
|
||||||
| distribution including header files, a compiler,
|
| distribution including header files, a compiler,
|
||||||
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
|
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
|
||||||
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
|
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
|
||||||
|
|
|
@ -40,7 +40,7 @@ p
|
||||||
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
|
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell I read the paper yesteday
|
+cell I read the paper yesterday
|
||||||
+cell read
|
+cell read
|
||||||
+cell read
|
+cell read
|
||||||
+cell verb
|
+cell verb
|
||||||
|
|
|
@ -94,7 +94,7 @@ p
|
||||||
| is mostly intended as a convenient, interactive wrapper. It performs
|
| is mostly intended as a convenient, interactive wrapper. It performs
|
||||||
| compatibility checks and prints detailed error messages and warnings.
|
| compatibility checks and prints detailed error messages and warnings.
|
||||||
| However, if you're downloading models as part of an automated build
|
| However, if you're downloading models as part of an automated build
|
||||||
| process, this only adds an unecessary layer of complexity. If you know
|
| process, this only adds an unnecessary layer of complexity. If you know
|
||||||
| which models your application needs, you should be specifying them directly.
|
| which models your application needs, you should be specifying them directly.
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
Loading…
Reference in New Issue
Block a user