spaCy (https://github.com/explosion/spaCy.git)
commit 91899d337b (parent 778212efea)

Tidy up language, lemmatizer and scorer
spacy/language.py

@@ -11,21 +11,18 @@ from collections import OrderedDict
 import itertools
 import weakref
 import functools
-import tqdm
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
-from .tagger import Tagger
 from .lemmatizer import Lemmatizer
 
 from .pipeline import DependencyParser, Tensorizer, Tagger
 from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
-from .compat import json_dumps, izip
+from .compat import json_dumps, izip, copy_reg
 from .scorer import Scorer
 from ._ml import link_vectors_to_models
 from .attrs import IS_STOP
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.punctuation import TOKENIZER_INFIXES
 from .lang.tokenizer_exceptions import TOKEN_MATCH
 from .lang.tag_map import TAG_MAP
 from .lang.lex_attrs import LEX_ATTRS, is_stop
@@ -57,16 +54,18 @@ class BaseDefaults(object):
     def create_tokenizer(cls, nlp=None):
         rules = cls.tokenizer_exceptions
         token_match = cls.token_match
-        prefix_search = util.compile_prefix_regex(cls.prefixes).search \
-                        if cls.prefixes else None
-        suffix_search = util.compile_suffix_regex(cls.suffixes).search \
-                        if cls.suffixes else None
-        infix_finditer = util.compile_infix_regex(cls.infixes).finditer \
-                         if cls.infixes else None
+        prefix_search = (util.compile_prefix_regex(cls.prefixes).search
+                         if cls.prefixes else None)
+        suffix_search = (util.compile_suffix_regex(cls.suffixes).search
+                         if cls.suffixes else None)
+        infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
+                          if cls.infixes else None)
         vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
         return Tokenizer(vocab, rules=rules,
-                         prefix_search=prefix_search, suffix_search=suffix_search,
-                         infix_finditer=infix_finditer, token_match=token_match)
+                         prefix_search=prefix_search,
+                         suffix_search=suffix_search,
+                         infix_finditer=infix_finditer,
+                         token_match=token_match)
 
     pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
     token_match = TOKEN_MATCH
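The parenthesized conditionals above feed straight into the Tokenizer constructor. As a rough illustration of the same wiring done by hand (a sketch against the v2-era API shown in this diff; the regexes are toy stand-ins for spaCy's real punctuation tables, not the actual TOKENIZER_PREFIXES/SUFFIXES/INFIXES):

    import re
    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # Toy stand-ins for the punctuation tables compiled above.
    prefix_re = re.compile(r'^["\(]')
    suffix_re = re.compile(r'["\)]$')
    infix_re = re.compile(r'[-~]')

    tokenizer = Tokenizer(nlp.vocab, rules={},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer)
    print([t.text for t in tokenizer(u'"hello-world"')])
    # ['"', 'hello', '-', 'world', '"']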
@@ -98,7 +97,7 @@ class Language(object):
 
     factories = {
         'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
-        'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
+        'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
         'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
         'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
         'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
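factories maps a component name to a callable that builds the component for a given nlp object, which is how 'tensorizer' now resolves to Tensorizer. A minimal sketch of registering and using a custom factory the same way (the 'doc_length' name and component are hypothetical, not part of spaCy):

    from spacy.lang.en import English
    from spacy.language import Language

    def doc_length(doc):
        # Pipeline components take a Doc, may modify it, and return it.
        print('tokens:', len(doc))
        return doc

    # Hypothetical factory: builds the component for a given nlp object.
    Language.factories['doc_length'] = lambda nlp, **cfg: doc_length

    nlp = English()
    component = Language.factories['doc_length'](nlp)
    nlp.add_pipe(component, name='doc_length')
    nlp(u'Hello world')  # prints: tokens: 2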
@@ -218,14 +217,14 @@ class Language(object):
     def add_pipe(self, component, name=None, before=None, after=None,
                  first=None, last=None):
         """Add a component to the processing pipeline. Valid components are
-        callables that take a `Doc` object, modify it and return it. Only one of
-        before, after, first or last can be set. Default behaviour is "last".
+        callables that take a `Doc` object, modify it and return it. Only one
+        of before/after/first/last can be set. Default behaviour is "last".
 
         component (callable): The pipeline component.
         name (unicode): Name of pipeline component. Overwrites existing
             component.name attribute if available. If no name is set and
             the component exposes no name attribute, component.__name__ is
-            used. An error is raised if the name already exists in the pipeline.
+            used. An error is raised if a name already exists in the pipeline.
         before (unicode): Component name to insert component directly before.
         after (unicode): Component name to insert component directly after.
         first (bool): Insert component first / not first in the pipeline.
@@ -240,7 +239,8 @@ class Language(object):
             name = component.name
         elif hasattr(component, '__name__'):
             name = component.__name__
-        elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
+        elif (hasattr(component, '__class__') and
+              hasattr(component.__class__, '__name__')):
             name = component.__class__.__name__
         else:
             name = repr(component)
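In practice, the name resolution above means a plain function gets its __name__ and only anonymous callables fall back to repr(). A short sketch of add_pipe honouring the one-of before/after/first/last rule (component names here are our own):

    from spacy.lang.en import English

    def my_component(doc):
        return doc  # takes a Doc, may modify it, returns it

    nlp = English()
    nlp.add_pipe(my_component, first=True)   # name taken from __name__
    print(nlp.pipe_names)                    # ['my_component']
    nlp.add_pipe(lambda doc: doc, name='noop', after='my_component')
    print(nlp.pipe_names)                    # ['my_component', 'noop']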
@@ -269,7 +269,7 @@ class Language(object):
         `name in nlp.pipe_names`.
 
         name (unicode): Name of the component.
-        RETURNS (bool): Whether a component of that name exists in the pipeline.
+        RETURNS (bool): Whether a component of the name exists in the pipeline.
         """
         return name in self.pipe_names
 
@@ -332,15 +332,12 @@ class Language(object):
             return doc
 
     def disable_pipes(self, *names):
-        '''Disable one or more pipeline components.
-
-        If used as a context manager, the pipeline will be restored to the initial
-        state at the end of the block. Otherwise, a DisabledPipes object is
-        returned, that has a `.restore()` method you can use to undo your
-        changes.
+        """Disable one or more pipeline components. If used as a context
+        manager, the pipeline will be restored to the initial state at the end
+        of the block. Otherwise, a DisabledPipes object is returned, that has
+        a `.restore()` method you can use to undo your changes.
 
         EXAMPLE:
-
         >>> nlp.add_pipe('parser')
         >>> nlp.add_pipe('tagger')
         >>> with nlp.disable_pipes('parser', 'tagger'):
@@ -351,7 +348,7 @@ class Language(object):
         >>> assert not nlp.has_pipe('parser')
         >>> disabled.restore()
         >>> assert nlp.has_pipe('parser')
-        '''
+        """
         return DisabledPipes(self, *names)
 
     def make_doc(self, text):
@@ -367,14 +364,14 @@ class Language(object):
         RETURNS (dict): Results from the update.
 
         EXAMPLE:
-            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+            >>> with nlp.begin_training(gold) as (trainer, optimizer):
             >>>     for epoch in trainer.epochs(gold):
             >>>         for docs, golds in epoch:
             >>>             state = nlp.update(docs, golds, sgd=optimizer)
         """
         if len(docs) != len(golds):
             raise IndexError("Update expects same number of docs and golds "
                              "Got: %d, %d" % (len(docs), len(golds)))
         if len(docs) == 0:
             return
         if sgd is None:
@@ -382,8 +379,10 @@ class Language(object):
             self._optimizer = Adam(Model.ops, 0.001)
             sgd = self._optimizer
         grads = {}
 
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
 
         pipes = list(self.pipeline)
         random.shuffle(pipes)
         for name, proc in pipes:
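The get_grads closure above is a capture-then-apply pattern: each pipe backpropagates into a local dict through something that looks like an optimizer, and the real optimizer is applied once per parameter afterwards. A framework-free sketch of the idea (names and values are illustrative only):

    grads = {}

    def get_grads(W, dW, key=None):
        # Looks like an optimizer callback, but only records the update.
        grads[key] = (W, dW)

    def sgd(W, dW, key=None, learn_rate=0.001):
        for i, g in enumerate(dW):
            W[i] -= learn_rate * g

    # Components "update" through the recorder...
    get_grads([1.0, 2.0], [0.5, 0.5], key='tagger')
    # ...then the captured gradients are applied in one pass.
    for key, (W, dW) in grads.items():
        sgd(W, dW, key=key)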
@@ -421,7 +420,7 @@ class Language(object):
         L2 = util.env_opt('L2_penalty', 1e-6)
         max_grad_norm = util.env_opt('grad_norm_clip', 1.)
         self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
                                beta2=beta2, eps=eps)
         self._optimizer.max_grad_norm = max_grad_norm
         self._optimizer.device = device
         return self._optimizer
@@ -461,7 +460,7 @@ class Language(object):
         L2 = util.env_opt('L2_penalty', 1e-6)
         max_grad_norm = util.env_opt('grad_norm_clip', 1.)
         self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
                                beta2=beta2, eps=eps)
         self._optimizer.max_grad_norm = max_grad_norm
         self._optimizer.device = device
         return self._optimizer
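Both optimizer blocks read their hyperparameters through util.env_opt, which returns the default unless an environment variable overrides it. A sketch, assuming env_opt consults os.environ for the bare name (as its use here suggests):

    import os
    from spacy import util

    os.environ['learn_rate'] = '0.01'    # set before the optimizer is built
    learn_rate = util.env_opt('learn_rate', 0.001)       # -> 0.01 (override)
    max_grad_norm = util.env_opt('grad_norm_clip', 1.)   # -> 1.0 (default)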
@@ -512,17 +511,17 @@ class Language(object):
             pass
 
     def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
              disable=[]):
-        """Process texts as a stream, and yield `Doc` objects in order. Supports
-        GIL-free multi-threading.
+        """Process texts as a stream, and yield `Doc` objects in order.
+        Supports GIL-free multi-threading.
 
         texts (iterator): A sequence of texts to process.
         as_tuples (bool):
             If set to True, inputs should be a sequence of
             (text, context) tuples. Output will then be a sequence of
             (doc, context) tuples. Defaults to False.
-        n_threads (int): The number of worker threads to use. If -1, OpenMP will
-            decide how many to use at run time. Default is 2.
+        n_threads (int): The number of worker threads to use. If -1, OpenMP
+            will decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
         disable (list): Names of the pipeline components to disable.
         YIELDS (Doc): Documents in the order of the original text.
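The as_tuples flag threads arbitrary context (record IDs, metadata) through the stream. A sketch using a blank English pipeline:

    from spacy.lang.en import English

    nlp = English()
    data = [(u'A text about cats.', 'doc-1'),
            (u'A text about dogs.', 'doc-2')]
    # (text, context) pairs in, (doc, context) pairs out, original order kept.
    for doc, context in nlp.pipe(data, as_tuples=True, batch_size=2):
        print(context, len(doc))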
@@ -546,7 +545,8 @@ class Language(object):
             if name in disable:
                 continue
             if hasattr(proc, 'pipe'):
-                docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
+                docs = proc.pipe(docs, n_threads=n_threads,
+                                 batch_size=batch_size)
             else:
                 # Apply the function, but yield the doc
                 docs = _pipe(proc, docs)
@@ -583,7 +583,7 @@ class Language(object):
         will include the model.
 
         path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be strings or `Path`-like objects.
         disable (list): Names of pipeline components to disable and prevent
             from being saved.
 
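Since the path may be a string or Path-like and the directory is created if missing, round-tripping a pipeline looks roughly like this (a sketch; the /tmp path is an arbitrary choice of ours):

    from pathlib import Path
    from spacy.lang.en import English

    nlp = English()
    nlp.to_disk('/tmp/my_model')          # a plain string works
    nlp.to_disk(Path('/tmp/my_model'))    # so does a Path object

    nlp2 = English().from_disk('/tmp/my_model')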
@@ -682,7 +682,7 @@ class Language(object):
 
 
 class DisabledPipes(list):
-    '''Manager for temporary pipeline disabling.'''
+    """Manager for temporary pipeline disabling."""
     def __init__(self, nlp, *names):
         self.nlp = nlp
         self.names = names
@@ -702,7 +702,8 @@ class DisabledPipes(list):
     def restore(self):
         '''Restore the pipeline to its state when DisabledPipes was created.'''
         current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
-        unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
+        unexpected = [name for name, pipe in current
+                      if not self.nlp.has_pipe(name)]
         if unexpected:
             # Don't change the pipeline if we're raising an error.
             self.nlp.pipeline = current
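Because DisabledPipes keeps the removed components, restore() can put everything back, and it refuses to if the pipeline was modified behind its back. A sketch of the non-context-manager usage described in the disable_pipes docstring (the 'noop' component is our own):

    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(lambda doc: doc, name='noop')

    disabled = nlp.disable_pipes('noop')   # returns a DisabledPipes object
    assert not nlp.has_pipe('noop')
    disabled.restore()                     # undo the change manually
    assert nlp.has_pipe('noop')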
spacy/lemmatizer.py

@@ -43,16 +43,15 @@ class Lemmatizer(object):
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology
                   if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
-        true_morph_key = morphology.get('morph', 0)
         if univ_pos == 'noun' and morphology.get('Number') == 'sing':
             return True
         elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
             return True
         # This maps 'VBP' to base form -- probably just need 'IS_BASE'
         # morphology
-        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
-                                     morphology.get('Tense') == 'pres' and \
-                                     morphology.get('Number') is None and \
+        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
+                                     morphology.get('Tense') == 'pres' and
+                                     morphology.get('Number') is None and
                                      not others):
             return True
         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
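The branch kept above is the 'VBP' case: a finite, present-tense verb with no Number feature and no other morphology counts as a base form. A sketch exercising it, assuming these checks live in the Lemmatizer's is_base_form method and the v2-era constructor takes index/exceptions/rules:

    from spacy.lemmatizer import Lemmatizer

    lemmatizer = Lemmatizer(index={}, exceptions={}, rules={})
    # The 'VBP' case: finite present-tense verb, nothing else set.
    assert lemmatizer.is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres'})
    assert lemmatizer.is_base_form('noun', {'Number': 'sing'})
    assert not lemmatizer.is_base_form('noun', {'Number': 'plur'})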
@@ -89,9 +88,6 @@ class Lemmatizer(object):
 def lemmatize(string, index, exceptions, rules):
     string = string.lower()
     forms = []
-    # TODO: Is this correct? See discussion in Issue #435.
-    #if string in index:
-    #    forms.append(string)
     forms.extend(exceptions.get(string, []))
     oov_forms = []
     if not forms:
spacy/scorer.py

@@ -74,8 +74,11 @@ class Scorer(object):
     @property
     def scores(self):
         return {
-            'uas': self.uas, 'las': self.las,
-            'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f,
+            'uas': self.uas,
+            'las': self.las,
+            'ents_p': self.ents_p,
+            'ents_r': self.ents_r,
+            'ents_f': self.ents_f,
             'tags_acc': self.tags_acc,
             'token_acc': self.token_acc
         }
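With one key per line, the scores dict reads like a schema: UAS/LAS for the parse, precision/recall/F for entities, plus tag and token accuracy. A sketch of reading it off a fresh Scorer (all metrics start at zero before evaluation):

    from spacy.scorer import Scorer

    scorer = Scorer()
    print(sorted(scorer.scores))
    # ['ents_f', 'ents_p', 'ents_r', 'las', 'tags_acc', 'token_acc', 'uas']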
@@ -85,7 +88,8 @@ class Scorer(object):
 
         gold_deps = set()
         gold_tags = set()
-        gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
+        gold_ents = set(tags_to_entities([annot[-1]
+                                          for annot in gold.orig_annot]))
         for id_, word, tag, head, dep, ner in gold.orig_annot:
             gold_tags.add((id_, tag))
             if dep not in (None, "") and dep.lower() not in punct_labels: