mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 04:31:17 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
80a5146ec2
|
@ -229,7 +229,7 @@ Compile from source
|
|||
The other way to install spaCy is to clone its
|
||||
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
|
||||
source. That is the common way if you want to make changes to the code base.
|
||||
You'll need to make sure that you have a development enviroment consisting of a
|
||||
You'll need to make sure that you have a development environment consisting of a
|
||||
Python distribution including header files, a compiler,
|
||||
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
|
||||
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.
|
||||
|
|
|
@ -3,15 +3,21 @@ from __future__ import print_function
|
|||
# NB! This breaks in plac on Python 2!!
|
||||
#from __future__ import unicode_literals
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import plac
|
||||
import sys
|
||||
from spacy.cli import download, link, info, package, train, convert
|
||||
from spacy.cli import download, link, info, package, train, convert, model
|
||||
from spacy.util import prints
|
||||
|
||||
commands = {'download': download, 'link': link, 'info': info, 'train': train,
|
||||
'convert': convert, 'package': package}
|
||||
commands = {
|
||||
'download': download,
|
||||
'link': link,
|
||||
'info': info,
|
||||
'train': train,
|
||||
'convert': convert,
|
||||
'package': package,
|
||||
'model': model
|
||||
}
|
||||
if len(sys.argv) == 1:
|
||||
prints(', '.join(commands), title="Available commands", exits=1)
|
||||
command = sys.argv.pop(1)
|
||||
|
@ -19,5 +25,7 @@ if __name__ == '__main__':
|
|||
if command in commands:
|
||||
plac.call(commands[command])
|
||||
else:
|
||||
prints("Available: %s" % ', '.join(commands),
|
||||
title="Unknown command: %s" % command, exits=1)
|
||||
prints(
|
||||
"Available: %s" % ', '.join(commands),
|
||||
title="Unknown command: %s" % command,
|
||||
exits=1)
|
||||
|
|
|
@ -4,3 +4,4 @@ from .link import link
|
|||
from .package import package
|
||||
from .train import train
|
||||
from .convert import convert
|
||||
from .model import model
|
||||
|
|
119
spacy/cli/model.py
Normal file
119
spacy/cli/model.py
Normal file
|
@ -0,0 +1,119 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import gzip
|
||||
import math
|
||||
from ast import literal_eval
|
||||
from pathlib import Path
|
||||
from preshed.counter import PreshCounter
|
||||
|
||||
import spacy
|
||||
from ..compat import fix_text
|
||||
from .. import util
|
||||
|
||||
|
||||
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
|
||||
model_path = Path(model_dir)
|
||||
freqs_path = Path(freqs_data)
|
||||
clusters_path = Path(clusters_data) if clusters_data else None
|
||||
vectors_path = Path(vectors_data) if vectors_data else None
|
||||
|
||||
check_dirs(freqs_path, clusters_path, vectors_path)
|
||||
# vocab = util.get_lang_class(lang).Defaults.create_vocab()
|
||||
nlp = spacy.blank(lang)
|
||||
vocab = nlp.vocab
|
||||
probs, oov_prob = read_probs(freqs_path)
|
||||
clusters = read_clusters(clusters_path) if clusters_path else {}
|
||||
populate_vocab(vocab, clusters, probs, oov_prob)
|
||||
create_model(model_path, nlp)
|
||||
|
||||
|
||||
def create_model(model_path, model):
|
||||
if not model_path.exists():
|
||||
model_path.mkdir()
|
||||
model.to_disk(model_path.as_posix())
|
||||
|
||||
|
||||
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
freqs_file = check_unzip(freqs_path)
|
||||
for i, line in enumerate(freqs_file):
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
freq = int(freq)
|
||||
counts.inc(i + 1, freq)
|
||||
total += freq
|
||||
counts.smooth()
|
||||
log_total = math.log(total)
|
||||
freqs_file = check_unzip(freqs_path)
|
||||
probs = {}
|
||||
for line in freqs_file:
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
doc_freq = int(doc_freq)
|
||||
freq = int(freq)
|
||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(
|
||||
key) < max_length:
|
||||
word = literal_eval(key)
|
||||
smooth_count = counts.smoother(int(freq))
|
||||
probs[word] = math.log(smooth_count) - log_total
|
||||
oov_prob = math.log(counts.smoother(0)) - log_total
|
||||
return probs, oov_prob
|
||||
|
||||
|
||||
def read_clusters(clusters_path):
|
||||
clusters = {}
|
||||
with clusters_path.open() as f:
|
||||
for line in f:
|
||||
try:
|
||||
cluster, word, freq = line.split()
|
||||
word = fix_text(word)
|
||||
except ValueError:
|
||||
continue
|
||||
# If the clusterer has only seen the word a few times, its
|
||||
# cluster is unreliable.
|
||||
if int(freq) >= 3:
|
||||
clusters[word] = cluster
|
||||
else:
|
||||
clusters[word] = '0'
|
||||
# Expand clusters with re-casing
|
||||
for word, cluster in list(clusters.items()):
|
||||
if word.lower() not in clusters:
|
||||
clusters[word.lower()] = cluster
|
||||
if word.title() not in clusters:
|
||||
clusters[word.title()] = cluster
|
||||
if word.upper() not in clusters:
|
||||
clusters[word.upper()] = cluster
|
||||
return clusters
|
||||
|
||||
|
||||
def populate_vocab(vocab, clusters, probs, oov_prob):
|
||||
for word, prob in reversed(
|
||||
sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
lexeme = vocab[word]
|
||||
lexeme.prob = prob
|
||||
lexeme.is_oov = False
|
||||
# Decode as a little-endian string, so that we can do & 15 to get
|
||||
# the first 4 bits. See _parse_features.pyx
|
||||
if word in clusters:
|
||||
lexeme.cluster = int(clusters[word][::-1], 2)
|
||||
else:
|
||||
lexeme.cluster = 0
|
||||
|
||||
|
||||
def check_unzip(file_path):
|
||||
file_path_str = file_path.as_posix()
|
||||
if file_path_str.endswith('gz'):
|
||||
return gzip.open(file_path_str)
|
||||
else:
|
||||
return file_path.open()
|
||||
|
||||
|
||||
def check_dirs(freqs_data, clusters_data, vectors_data):
|
||||
if not freqs_data.is_file():
|
||||
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
|
||||
if clusters_data and not clusters_data.is_file():
|
||||
util.sys_exit(
|
||||
clusters_data.as_posix(), title="No Brown clusters file found")
|
||||
if vectors_data and not vectors_data.is_file():
|
||||
util.sys_exit(
|
||||
vectors_data.as_posix(), title="No word vectors file found")
|
|
@ -406,11 +406,11 @@ cdef class GoldParse:
|
|||
if tags is None:
|
||||
tags = [None for _ in doc]
|
||||
if heads is None:
|
||||
heads = [token.i for token in doc]
|
||||
heads = [None for token in doc]
|
||||
if deps is None:
|
||||
deps = [None for _ in doc]
|
||||
if entities is None:
|
||||
entities = ['-' for _ in doc]
|
||||
entities = [None for _ in doc]
|
||||
elif len(entities) == 0:
|
||||
entities = ['O' for _ in doc]
|
||||
elif not isinstance(entities[0], basestring):
|
||||
|
|
|
@ -200,6 +200,7 @@ class Language(object):
|
|||
else:
|
||||
flat_list.append(pipe)
|
||||
self.pipeline = flat_list
|
||||
self._optimizer = None
|
||||
|
||||
@property
|
||||
def meta(self):
|
||||
|
@ -278,7 +279,7 @@ class Language(object):
|
|||
return self.tokenizer(text)
|
||||
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
||||
update_tensors=False):
|
||||
update_shared=False):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
|
@ -298,6 +299,10 @@ class Language(object):
|
|||
"Got: %d, %d" % (len(docs), len(golds)))
|
||||
if len(docs) == 0:
|
||||
return
|
||||
if sgd is None:
|
||||
if self._optimizer is None:
|
||||
self._optimizer = Adam(Model.ops, 0.001)
|
||||
sgd = self._optimizer
|
||||
tok2vec = self.pipeline[0]
|
||||
feats = tok2vec.doc2feats(docs)
|
||||
grads = {}
|
||||
|
@ -312,12 +317,13 @@ class Language(object):
|
|||
continue
|
||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||
drop=drop, sgd=get_grads, losses=losses)
|
||||
if update_tensors and d_tokvecses is not None:
|
||||
if update_shared and d_tokvecses is not None:
|
||||
for i, d_tv in enumerate(d_tokvecses):
|
||||
all_d_tokvecses[i] += d_tv
|
||||
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
||||
for key, (W, dW) in grads.items():
|
||||
sgd(W, dW, key=key)
|
||||
if update_shared and bp_tokvecses is not None:
|
||||
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
||||
for key, (W, dW) in grads.items():
|
||||
sgd(W, dW, key=key)
|
||||
# Clear the tensor variable, to free GPU memory.
|
||||
# If we don't do this, the memory leak gets pretty
|
||||
# bad, because we may be holding part of a batch.
|
||||
|
@ -378,11 +384,11 @@ class Language(object):
|
|||
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||
beta2=beta2, eps=eps)
|
||||
optimizer.max_grad_norm = max_grad_norm
|
||||
optimizer.device = device
|
||||
return optimizer
|
||||
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||
beta2=beta2, eps=eps)
|
||||
self._optimizer.max_grad_norm = max_grad_norm
|
||||
self._optimizer.device = device
|
||||
return self._optimizer
|
||||
|
||||
def evaluate(self, docs_golds):
|
||||
scorer = Scorer()
|
||||
|
|
|
@ -294,6 +294,8 @@ class NeuralTagger(BaseThincComponent):
|
|||
doc.is_tagged = True
|
||||
|
||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs, tokvecs = docs_tokvecs
|
||||
|
||||
if self.model.nI is None:
|
||||
|
@ -302,6 +304,8 @@ class NeuralTagger(BaseThincComponent):
|
|||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||
|
||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||
if losses is not None:
|
||||
losses[self.name] += loss
|
||||
return d_tokvecs
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
|
|
|
@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||
end = end or len(gold.ner)
|
||||
if all([tag == '-' for tag in gold.ner[start:end]]):
|
||||
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
|
|
@ -483,6 +483,9 @@ cdef class Parser:
|
|||
return beams
|
||||
|
||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||
docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds)
|
||||
if not golds:
|
||||
return None
|
||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||
return self.update_beam(docs_tokvecs, golds,
|
||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||
|
@ -555,6 +558,9 @@ cdef class Parser:
|
|||
|
||||
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||
drop=0., sgd=None, losses=None):
|
||||
docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds)
|
||||
if not golds:
|
||||
return None
|
||||
if width is None:
|
||||
width = self.cfg.get('beam_width', 2)
|
||||
if density is None:
|
||||
|
@ -605,6 +611,15 @@ cdef class Parser:
|
|||
bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||
return d_tokvecs
|
||||
|
||||
def _filter_unlabelled(self, docs_tokvecs, golds):
|
||||
'''Remove inputs that have no relevant labels before update'''
|
||||
has_golds = [self.moves.has_gold(gold) for gold in golds]
|
||||
docs, tokvecs = docs_tokvecs
|
||||
docs = [docs[i] for i, has_gold in enumerate(has_golds) if has_gold]
|
||||
tokvecs = [tokvecs[i] for i, has_gold in enumerate(has_golds) if has_gold]
|
||||
golds = [golds[i] for i, has_gold in enumerate(has_golds) if has_gold]
|
||||
return (docs, tokvecs), golds
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
|
|
|
@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
|
|||
|
||||
p
|
||||
| The token within the span that's highest in the parse tree. If there's a
|
||||
| tie, the earlist is prefered.
|
||||
| tie, the earliest is preferred.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
|
|
|
@ -39,7 +39,7 @@ p
|
|||
+h(2, "special-cases") Adding special case tokenization rules
|
||||
|
||||
p
|
||||
| Most domains have at least some idiosyncracies that require custom
|
||||
| Most domains have at least some idiosyncrasies that require custom
|
||||
| tokenization rules. This could be very certain expressions, or
|
||||
| abbreviations only used in this specific field.
|
||||
|
||||
|
|
|
@ -109,7 +109,7 @@ p
|
|||
| The other way to install spaCy is to clone its
|
||||
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
|
||||
| the common way if you want to make changes to the code base. You'll need to
|
||||
| make sure that you have a development enviroment consisting of a Python
|
||||
| make sure that you have a development environment consisting of a Python
|
||||
| distribution including header files, a compiler,
|
||||
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
|
||||
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
|
||||
|
|
|
@ -40,7 +40,7 @@ p
|
|||
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
|
||||
|
||||
+row
|
||||
+cell I read the paper yesteday
|
||||
+cell I read the paper yesterday
|
||||
+cell read
|
||||
+cell read
|
||||
+cell verb
|
||||
|
|
|
@ -94,7 +94,7 @@ p
|
|||
| is mostly intended as a convenient, interactive wrapper. It performs
|
||||
| compatibility checks and prints detailed error messages and warnings.
|
||||
| However, if you're downloading models as part of an automated build
|
||||
| process, this only adds an unecessary layer of complexity. If you know
|
||||
| process, this only adds an unnecessary layer of complexity. If you know
|
||||
| which models your application needs, you should be specifying them directly.
|
||||
|
||||
p
|
||||
|
|
Loading…
Reference in New Issue
Block a user