mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Merge remote-tracking branch 'upstream/master'
This commit is contained in:
commit
9a6811acc4
|
@ -24,4 +24,4 @@ install:
|
||||||
|
|
||||||
# run tests
|
# run tests
|
||||||
script:
|
script:
|
||||||
- "py.test tests/ website/tests/ -x"
|
- "py.test tests/ -x"
|
||||||
|
|
|
@ -148,8 +148,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
||||||
nlp.end_training(model_dir)
|
nlp.end_training(model_dir)
|
||||||
print('done')
|
print('done')
|
||||||
|
|
||||||
|
|
||||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||||
beam_width=None):
|
beam_width=None, cand_preproc=None):
|
||||||
nlp = Language(data_dir=model_dir)
|
nlp = Language(data_dir=model_dir)
|
||||||
if beam_width is not None:
|
if beam_width is not None:
|
||||||
nlp.parser.cfg.beam_width = beam_width
|
nlp.parser.cfg.beam_width = beam_width
|
||||||
|
@ -166,16 +167,14 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
|
||||||
nlp.entity(tokens)
|
nlp.entity(tokens)
|
||||||
nlp.parser(tokens)
|
nlp.parser(tokens)
|
||||||
else:
|
else:
|
||||||
tokens = nlp(raw_text, merge_mwes=False)
|
tokens = nlp(raw_text)
|
||||||
gold = GoldParse(tokens, annot_tuples)
|
gold = GoldParse(tokens, annot_tuples)
|
||||||
scorer.score(tokens, gold, verbose=verbose)
|
scorer.score(tokens, gold, verbose=verbose)
|
||||||
return scorer
|
return scorer
|
||||||
|
|
||||||
|
|
||||||
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
|
def write_parses(Language, dev_loc, model_dir, out_loc):
|
||||||
nlp = Language(data_dir=model_dir)
|
nlp = Language(data_dir=model_dir)
|
||||||
if beam_width is not None:
|
|
||||||
nlp.parser.cfg.beam_width = beam_width
|
|
||||||
gold_tuples = read_json_file(dev_loc)
|
gold_tuples = read_json_file(dev_loc)
|
||||||
scorer = Scorer()
|
scorer = Scorer()
|
||||||
out_file = codecs.open(out_loc, 'w', 'utf8')
|
out_file = codecs.open(out_loc, 'w', 'utf8')
|
||||||
|
@ -188,14 +187,16 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
|
||||||
nlp.entity(tokens)
|
nlp.entity(tokens)
|
||||||
nlp.parser(tokens)
|
nlp.parser(tokens)
|
||||||
else:
|
else:
|
||||||
tokens = nlp(raw_text, merge_mwes=False)
|
tokens = nlp(raw_text)
|
||||||
gold = GoldParse(tokens, annot_tuples)
|
#gold = GoldParse(tokens, annot_tuples)
|
||||||
scorer.score(tokens, gold, verbose=False)
|
#scorer.score(tokens, gold, verbose=False)
|
||||||
for t in tokens:
|
for sent in tokens.sents:
|
||||||
out_file.write(
|
for t in sent:
|
||||||
'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
|
if not t.is_space:
|
||||||
)
|
out_file.write(
|
||||||
return scorer
|
'%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
|
||||||
|
)
|
||||||
|
out_file.write('\n')
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
|
@ -220,14 +221,15 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
|
||||||
gold_preproc=gold_preproc, n_sents=n_sents,
|
gold_preproc=gold_preproc, n_sents=n_sents,
|
||||||
corruption_level=corruption_level, n_iter=n_iter,
|
corruption_level=corruption_level, n_iter=n_iter,
|
||||||
verbose=verbose)
|
verbose=verbose)
|
||||||
#if out_loc:
|
if out_loc:
|
||||||
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
|
write_parses(English, dev_loc, model_dir, out_loc)
|
||||||
scorer = evaluate(English, list(read_json_file(dev_loc)),
|
scorer = evaluate(English, list(read_json_file(dev_loc)),
|
||||||
model_dir, gold_preproc=gold_preproc, verbose=verbose)
|
model_dir, gold_preproc=gold_preproc, verbose=verbose)
|
||||||
print('TOK', scorer.token_acc)
|
print('TOK', scorer.token_acc)
|
||||||
print('POS', scorer.tags_acc)
|
print('POS', scorer.tags_acc)
|
||||||
print('UAS', scorer.uas)
|
print('UAS', scorer.uas)
|
||||||
print('LAS', scorer.las)
|
print('LAS', scorer.las)
|
||||||
|
print('SBD', scorer.sbd_acc)
|
||||||
|
|
||||||
print('NER P', scorer.ents_p)
|
print('NER P', scorer.ents_p)
|
||||||
print('NER R', scorer.ents_r)
|
print('NER R', scorer.ents_r)
|
||||||
|
|
151
bin/parser/train_ud.py
Normal file
151
bin/parser/train_ud.py
Normal file
|
@ -0,0 +1,151 @@
|
||||||
|
import plac
|
||||||
|
import json
|
||||||
|
from os import path
|
||||||
|
import shutil
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
|
from spacy.syntax.util import Config
|
||||||
|
from spacy.gold import GoldParse
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.tagger import Tagger
|
||||||
|
from spacy.syntax.parser import Parser
|
||||||
|
from spacy.syntax.arc_eager import ArcEager
|
||||||
|
from spacy.syntax.parser import get_templates
|
||||||
|
from spacy.scorer import Scorer
|
||||||
|
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
|
from spacy.tagger import W_orth
|
||||||
|
|
||||||
|
TAGGER_TEMPLATES = (
|
||||||
|
(W_orth,),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from codecs import open
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TreebankParser(object):
    """Tagger + dependency parser pipeline operating on pre-tokenized input.

    Holds a vocab, tokenizer, tagger and parser. Sentences are always passed
    in as lists of word strings, which are turned into token sequences with
    ``tokenizer.tokens_from_list``.
    """

    @staticmethod
    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
        """Create fresh ``deps`` and ``pos`` directories under *model_dir*.

        Any existing ``deps``/``pos`` directory is deleted first. The parser
        configuration (feature set, seed, labels) is then written into
        ``deps``. *templates* is accepted but not used by this method.
        """
        dep_model_dir = path.join(model_dir, 'deps')
        pos_model_dir = path.join(model_dir, 'pos')
        for sub_dir in (dep_model_dir, pos_model_dir):
            if path.exists(sub_dir):
                shutil.rmtree(sub_dir)
        for sub_dir in (dep_model_dir, pos_model_dir):
            # makedirs rather than mkdir: also creates model_dir itself when
            # it does not exist yet instead of failing.
            os.makedirs(sub_dir)

        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                     labels=labels)

    @classmethod
    def from_dir(cls, tag_map, model_dir):
        """Build a pipeline with a blank tagger and a parser loaded from
        ``<model_dir>/deps``.

        *tag_map* is handed to the ``Vocab`` constructor.
        """
        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
        return cls(vocab, tokenizer, tagger, parser)

    def __init__(self, vocab, tokenizer, tagger, parser):
        """Store the four pipeline components unchanged."""
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tagger = tagger
        self.parser = parser

    def train(self, words, tags, heads, deps):
        """Run one training update on a single gold-annotated sentence.

        The tagger is always updated. The parser is only updated when the
        gold parse reports itself as projective; other sentences are skipped
        for the parser. Returns None.
        """
        tokens = self.tokenizer.tokens_from_list(list(words))
        self.tagger.train(tokens, tags)

        # Start again from a fresh token sequence, tag it with the tagger's
        # own predictions, and train the parser on that.
        tokens = self.tokenizer.tokens_from_list(list(words))
        ids = range(len(words))
        ner = ['O'] * len(words)
        gold = GoldParse(tokens, (ids, words, tags, heads, deps, ner),
                         make_projective=False)
        self.tagger(tokens)
        if not gold.is_projective:
            return
        try:
            self.parser.train(tokens, gold)
        except Exception:
            # Debugging aid: dump the offending sentence, then re-raise.
            for id_, word, head, dep in zip(ids, words, heads, deps):
                print(id_, word, head, dep)
            raise

    def __call__(self, words, tags=None):
        """Tag and parse *words* and return the resulting tokens.

        When *tags* is given, those tag strings are assigned instead of
        running the tagger's own prediction.
        """
        tokens = self.tokenizer.tokens_from_list(list(words))
        if tags is None:
            self.tagger(tokens)
        else:
            self.tagger.tag_from_strings(tokens, tags)
        self.parser(tokens)
        return tokens

    def end_training(self, data_dir):
        """Finish training and write the models and string table to *data_dir*.

        Writes ``deps/model``, ``pos/model`` and ``vocab/strings.txt``.
        """
        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
        # Nothing else in this class creates the vocab directory, so make
        # sure it exists before the string table is dumped into it.
        vocab_dir = path.join(data_dir, 'vocab')
        if not path.isdir(vocab_dir):
            os.makedirs(vocab_dir)
        self.vocab.strings.dump(path.join(vocab_dir, 'strings.txt'))
|
||||||
|
|
||||||
|
|
||||||
|
def read_conllx(loc):
    """Read a 10-column CoNLL-X / CoNLL-U file, yielding one entry per sentence.

    Each yielded value has the shape ``(None, [(annot_tuples, [])])`` where
    ``annot_tuples`` is a list of six parallel tuples:
    ``(ids, words, tags, heads, deps, ner)``.

    * ids and heads are converted to 0-based indices; a root token (head
      ``0`` in the file) gets its own index as head.
    * the dependency label ``root`` is rewritten to ``ROOT``.
    * tags come from the fifth column; ner is always ``'O'``.
    * lines whose id contains ``-`` (multi-word token ranges) are skipped.

    Comment lines (starting with ``#``) are ignored wherever they occur, and
    sentences with no usable token lines produce no output, so an empty file
    yields nothing.
    """
    # Local import keeps the function independent of which ``open`` the
    # enclosing module has bound.
    import io

    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    for sent in text.strip().split('\n\n'):
        tokens = []
        for line in sent.strip().split('\n'):
            if not line.strip() or line.startswith('#'):
                continue
            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
            if '-' in id_:
                continue
            id_ = int(id_) - 1
            head = (int(head) - 1) if head != '0' else id_
            dep = 'ROOT' if dep == 'root' else dep
            tokens.append((id_, word, tag, head, dep, 'O'))
        if not tokens:
            continue
        # Materialize the columns: callers index into them and iterate them
        # more than once, which a bare zip object does not allow on Python 3.
        tuples = list(zip(*tokens))
        yield (None, [(tuples, [])])
|
||||||
|
|
||||||
|
|
||||||
|
def score_model(nlp, gold_docs, verbose=False):
    """Score *nlp* against *gold_docs* and return the populated ``Scorer``.

    *gold_docs* is iterated as ``(_, sentences)`` pairs, each sentence being
    an ``(annot_tuples, _)`` pair. The pipeline is called with the words
    (``annot_tuples[1]``) and the gold tags (``annot_tuples[2]``), and its
    output is scored against a ``GoldParse`` built from the same annotations.
    """
    result = Scorer()
    sentences = (annots for _, sents in gold_docs for annots, _ in sents)
    for annots in sentences:
        words = list(annots[1])
        gold_tags = list(annots[2])
        parsed = nlp(words, tags=gold_tags)
        reference = GoldParse(parsed, annots)
        result.score(parsed, reference, verbose=verbose)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def main(train_loc, dev_loc, model_dir, tag_map_loc, n_iter=15):
    """Train a tagger and parser on a CoNLL treebank and report dev scores.

    train_loc:   training treebank, read with ``read_conllx``.
    dev_loc:     development treebank, re-read for every evaluation.
    model_dir:   directory the models are written to.
    tag_map_loc: JSON file holding the tag map.
    n_iter:      number of passes over the training data (default 15, the
                 value that used to be hard-coded).

    After each pass, prints the iteration number, UAS and tag accuracy on
    the dev set. After training ends, prints a final line that also
    includes LAS.
    """
    # May arrive as a string when supplied on the command line.
    n_iter = int(n_iter)

    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    labels = ArcEager.get_labels(train_sents)
    templates = get_templates('basic')

    TreebankParser.setup_model_dir(model_dir, labels, templates)

    nlp = TreebankParser.from_dir(tag_map, model_dir)

    for itn in range(n_iter):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                nlp.train(words, tags, heads, deps)
        random.shuffle(train_sents)
        scorer = score_model(nlp, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp.end_training(model_dir)
    scorer = score_model(nlp, read_conllx(dev_loc))
    # Same number the loop variable held on its last pass, but does not rely
    # on the loop having run at least once.
    last_itn = n_iter - 1
    print('%d:\t%.3f\t%.3f\t%.3f' % (last_itn, scorer.uas, scorer.las, scorer.tags_acc))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
plac.call(main)
|
95
contributors/chrisdubois.md
Normal file
95
contributors/chrisdubois.md
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
Syllogism Contributor Agreement
|
||||||
|
===============================
|
||||||
|
|
||||||
|
This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
|
||||||
|
Agreement. The SCA applies to any contribution that you make to any product or
|
||||||
|
project managed by us (the “project”), and sets out the intellectual property
|
||||||
|
rights you grant to us in the contributed materials. The term “us” shall mean
|
||||||
|
Syllogism Co. The term "you" shall mean the person or entity identified below.
|
||||||
|
If you agree to be bound by these terms, fill in the information requested below
|
||||||
|
and include the filled-in version with your first pull-request, under the file
|
||||||
|
contributors/. The name of the file should be your GitHub username, with the
|
||||||
|
extension .md. For example, the user example_user would create the file
|
||||||
|
spaCy/contributors/example_user.md .
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
1. The term 'contribution' or ‘contributed materials’ means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual, documentation,
|
||||||
|
or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and registrations,
|
||||||
|
in your contribution:
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such assignment
|
||||||
|
is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
|
||||||
|
irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
|
||||||
|
to exercise all rights under those copyrights. This includes, at our option, the
|
||||||
|
right to sublicense these same rights to third parties through multiple levels of
|
||||||
|
sublicensees or other licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your contribution
|
||||||
|
as if each of us were the sole owners, and if one of us makes a derivative work
|
||||||
|
of your contribution, the one who makes the derivative work (or has it made) will
|
||||||
|
be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution against
|
||||||
|
us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and exercise
|
||||||
|
all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the consent
|
||||||
|
of, pay or render an accounting to the other for any use or distribution of your
|
||||||
|
contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
|
||||||
|
worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer your
|
||||||
|
contribution in whole or in part, alone or in combination with
|
||||||
|
or included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through multiple
|
||||||
|
levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective on
|
||||||
|
the date you first submitted a contribution to us, even if your submission took
|
||||||
|
place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of authorship
|
||||||
|
and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any third
|
||||||
|
party's copyrights, trademarks, patents, or other intellectual property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and other
|
||||||
|
applicable export and import laws. You agree to notify us if you become aware of
|
||||||
|
any circumstance which would make any of the foregoing representations inaccurate
|
||||||
|
in any respect. Syllogism Co. may publicly disclose your participation in the project,
|
||||||
|
including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable U.S.
|
||||||
|
Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
x I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
|
||||||
|
|
||||||
|
____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Chris DuBois |
|
||||||
|
| Company's name (if applicable) | |
|
||||||
|
| Title or Role (if applicable) | |
|
||||||
|
| Date | 2015.10.07 |
|
||||||
|
| GitHub username | chrisdubois |
|
||||||
|
| Website (optional) | |
|
||||||
|
|
|
@ -45,14 +45,14 @@ def main():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
texts = [
|
texts = [
|
||||||
u'Net income was $9.4 million compared to the prior year of $2.7 million.',
|
u'Net income was $9.4 million compared to the prior year of $2.7 million.',
|
||||||
u'Revenue exceeded twelve billion dollars, with a loss of $1b',
|
u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
|
||||||
]
|
]
|
||||||
|
|
||||||
for text in texts:
|
for text in texts:
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
relations = extract_currency_relations(doc)
|
relations = extract_currency_relations(doc)
|
||||||
for r1, r2 in relations:
|
for r1, r2 in relations:
|
||||||
print(r1.text, r2.ent_type_)
|
print(r1.text, r2.ent_type_, r2.text)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -22,73 +22,77 @@ our pattern set stays very small (exact size depends on the maximum length we're
|
||||||
looking for, as the query language currently has no quantifiers)
|
looking for, as the query language currently has no quantifiers)
|
||||||
"""
|
"""
|
||||||
from __future__ import print_function, unicode_literals, division
|
from __future__ import print_function, unicode_literals, division
|
||||||
|
from ast import literal_eval
|
||||||
|
from bz2 import BZ2File
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
import codecs
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
|
|
||||||
from preshed.maps import PreshMap
|
from preshed.maps import PreshMap
|
||||||
|
from preshed.counter import PreshCounter
|
||||||
from spacy.strings import hash_string
|
from spacy.strings import hash_string
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import PhraseMatcher
|
||||||
|
|
||||||
from spacy.attrs import FLAG63 as U_ENT
|
|
||||||
from spacy.attrs import FLAG62 as L_ENT
|
|
||||||
from spacy.attrs import FLAG61 as I_ENT
|
|
||||||
from spacy.attrs import FLAG60 as B_ENT
|
|
||||||
|
|
||||||
|
|
||||||
def get_bilou(length):
|
def read_gazetteer(tokenizer, loc, n=-1):
|
||||||
if length == 1:
|
for i, line in enumerate(open(loc)):
|
||||||
return [U_ENT]
|
phrase = literal_eval('u' + line.strip())
|
||||||
else:
|
if ' (' in phrase and phrase.endswith(')'):
|
||||||
return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT]
|
phrase = phrase.split(' (', 1)[0]
|
||||||
|
if i >= n:
|
||||||
|
break
|
||||||
|
phrase = tokenizer(phrase)
|
||||||
|
if all((t.is_lower and t.prob >= -10) for t in phrase):
|
||||||
|
continue
|
||||||
|
if len(phrase) >= 2:
|
||||||
|
yield phrase
|
||||||
|
|
||||||
|
|
||||||
def make_matcher(vocab, max_length):
|
def read_text(bz2_loc):
|
||||||
abstract_patterns = []
|
with BZ2File(bz2_loc) as file_:
|
||||||
for length in range(1, max_length+1):
|
for line in file_:
|
||||||
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
yield line.decode('utf8')
|
||||||
return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)})
|
|
||||||
|
|
||||||
|
|
||||||
def get_matches(matcher, pattern_ids, doc):
|
def get_matches(tokenizer, phrases, texts, max_length=6):
|
||||||
matches = []
|
matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
|
||||||
for label, start, end in matcher(doc):
|
print("Match")
|
||||||
candidate = doc[start : end]
|
for text in texts:
|
||||||
if pattern_ids[hash_string(candidate.text)] == True:
|
doc = tokenizer(text)
|
||||||
start = candidate[0].idx
|
matches = matcher(doc)
|
||||||
end = candidate[-1].idx + len(candidate[-1])
|
for mwe in doc.ents:
|
||||||
matches.append((start, end, candidate.root.tag_, candidate.text))
|
yield mwe
|
||||||
return matches
|
|
||||||
|
|
||||||
|
|
||||||
def merge_matches(doc, matches):
|
def main(patterns_loc, text_loc, counts_loc, n=10000000):
|
||||||
for start, end, tag, text in matches:
|
|
||||||
doc.merge(start, end, tag, text, 'MWE')
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
nlp = English(parser=False, tagger=False, entity=False)
|
nlp = English(parser=False, tagger=False, entity=False)
|
||||||
|
print("Make matcher")
|
||||||
gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones']
|
phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
|
||||||
example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
|
counts = PreshCounter()
|
||||||
pattern_ids = PreshMap()
|
t1 = time.time()
|
||||||
max_length = 0
|
for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
|
||||||
for pattern_str in gazetteer:
|
counts.inc(hash_string(mwe.text), 1)
|
||||||
pattern = nlp.tokenizer(pattern_str)
|
t2 = time.time()
|
||||||
bilou_tags = get_bilou(len(pattern))
|
print("10m tokens in %d s" % (t2 - t1))
|
||||||
for word, tag in zip(pattern, bilou_tags):
|
|
||||||
lexeme = nlp.vocab[word.orth]
|
|
||||||
lexeme.set_flag(tag, True)
|
|
||||||
pattern_ids[hash_string(pattern.text)] = True
|
|
||||||
max_length = max(max_length, len(pattern))
|
|
||||||
|
|
||||||
matcher = make_matcher(nlp.vocab, max_length)
|
|
||||||
|
|
||||||
doc = nlp(example_text)
|
|
||||||
matches = get_matches(matcher, pattern_ids, doc)
|
|
||||||
merge_matches(doc, matches)
|
|
||||||
for token in doc:
|
|
||||||
print(token.text, token.ent_type_)
|
|
||||||
|
|
||||||
|
with codecs.open(counts_loc, 'w', 'utf8') as file_:
|
||||||
|
for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
|
||||||
|
text = phrase.string
|
||||||
|
key = hash_string(text)
|
||||||
|
count = counts[key]
|
||||||
|
if count != 0:
|
||||||
|
file_.write('%d\t%s\n' % (count, text))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
plac.call(main)
|
if False:
|
||||||
|
import cProfile
|
||||||
|
import pstats
|
||||||
|
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
|
||||||
|
s = pstats.Stats("Profile.prof")
|
||||||
|
s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
else:
|
||||||
|
plac.call(main)
|
||||||
|
|
|
@ -13,5 +13,7 @@
|
||||||
"ADP": {"pos": "ADP"},
|
"ADP": {"pos": "ADP"},
|
||||||
"SYM": {"pos": "SYM"},
|
"SYM": {"pos": "SYM"},
|
||||||
"X": {"pos": "X"},
|
"X": {"pos": "X"},
|
||||||
"INTJ": {"pos": "INTJ"}
|
"INTJ": {"pos": "INTJ"},
|
||||||
|
"DET": {"pos": "DET"},
|
||||||
|
"PART": {"pos": "PART"}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,43 +2,43 @@
|
||||||
"S": {"pos": "NOUN"},
|
"S": {"pos": "NOUN"},
|
||||||
"E": {"pos": "ADP"},
|
"E": {"pos": "ADP"},
|
||||||
"RD": {"pos": "DET"},
|
"RD": {"pos": "DET"},
|
||||||
"V": {"pos": "VER"},
|
"V": {"pos": "VERB"},
|
||||||
"_": {"pos": "_"},
|
"_": {"pos": "NO_TAG"},
|
||||||
"A": {"pos": "ADJ"},
|
"A": {"pos": "ADJ"},
|
||||||
"SP": {"pos": "PROP"},
|
"SP": {"pos": "PROPN"},
|
||||||
"FF": {"pos": "PUNC"},
|
"FF": {"pos": "PUNCT"},
|
||||||
"FS": {"pos": "PUNC"},
|
"FS": {"pos": "PUNCT"},
|
||||||
"B": {"pos": "ADV"},
|
"B": {"pos": "ADV"},
|
||||||
"CC": {"pos": "CON"},
|
"CC": {"pos": "CONJ"},
|
||||||
"FB": {"pos": "PUNC"},
|
"FB": {"pos": "PUNCT"},
|
||||||
"VA": {"pos": "AUX"},
|
"VA": {"pos": "AUX"},
|
||||||
"PC": {"pos": "PRO"},
|
"PC": {"pos": "PRON"},
|
||||||
"N": {"pos": "NUM"},
|
"N": {"pos": "NUM"},
|
||||||
"RI": {"pos": "DET"},
|
"RI": {"pos": "DET"},
|
||||||
"PR": {"pos": "PRO"},
|
"PR": {"pos": "PRON"},
|
||||||
"CS": {"pos": "SCON"},
|
"CS": {"pos": "SCONJ"},
|
||||||
"BN": {"pos": "ADV"},
|
"BN": {"pos": "ADV"},
|
||||||
"AP": {"pos": "DET"},
|
"AP": {"pos": "DET"},
|
||||||
"VM": {"pos": "AUX"},
|
"VM": {"pos": "AUX"},
|
||||||
"DI": {"pos": "DET"},
|
"DI": {"pos": "DET"},
|
||||||
"FC": {"pos": "PUNC"},
|
"FC": {"pos": "PUNCT"},
|
||||||
"PI": {"pos": "PRO"},
|
"PI": {"pos": "PRON"},
|
||||||
"DD": {"pos": "DET"},
|
"DD": {"pos": "DET"},
|
||||||
"DQ": {"pos": "DET"},
|
"DQ": {"pos": "DET"},
|
||||||
"PQ": {"pos": "PRO"},
|
"PQ": {"pos": "PRON"},
|
||||||
"PD": {"pos": "PRO"},
|
"PD": {"pos": "PRON"},
|
||||||
"NO": {"pos": "ADJ"},
|
"NO": {"pos": "ADJ"},
|
||||||
"PE": {"pos": "PRO"},
|
"PE": {"pos": "PRON"},
|
||||||
"T": {"pos": "DET"},
|
"T": {"pos": "DET"},
|
||||||
"X": {"pos": "SYM"},
|
"X": {"pos": "SYM"},
|
||||||
"SW": {"pos": "X"},
|
"SW": {"pos": "X"},
|
||||||
"NO": {"pos": "PRO"},
|
"NO": {"pos": "PRON"},
|
||||||
"I": {"pos": "INT"},
|
"I": {"pos": "INTJ"},
|
||||||
"X": {"pos": "X"},
|
"X": {"pos": "X"},
|
||||||
"DR": {"pos": "DET"},
|
"DR": {"pos": "DET"},
|
||||||
"EA": {"pos": "ADP"},
|
"EA": {"pos": "ADP"},
|
||||||
"PP": {"pos": "PRO"},
|
"PP": {"pos": "PRON"},
|
||||||
"X": {"pos": "NUM"},
|
"X": {"pos": "NUM"},
|
||||||
"DE": {"pos": "DET"},
|
"DE": {"pos": "DET"},
|
||||||
"X": {"pos": "PAR"}
|
"X": {"pos": "PART"}
|
||||||
}
|
}
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -156,8 +156,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
|
||||||
'spacy.morphology', 'spacy.tagger',
|
'spacy.morphology', 'spacy.tagger',
|
||||||
'spacy.syntax.stateclass',
|
'spacy.syntax.stateclass',
|
||||||
'spacy._ml', 'spacy._theano',
|
'spacy._ml', 'spacy._theano',
|
||||||
'spacy.tokenizer', 'spacy.en.attrs',
|
'spacy.tokenizer',
|
||||||
'spacy.en.pos', 'spacy.syntax.parser',
|
'spacy.syntax.parser',
|
||||||
'spacy.syntax.transition_system',
|
'spacy.syntax.transition_system',
|
||||||
'spacy.syntax.arc_eager',
|
'spacy.syntax.arc_eager',
|
||||||
'spacy.syntax._parse_features',
|
'spacy.syntax._parse_features',
|
||||||
|
|
|
@ -1,64 +0,0 @@
|
||||||
from ..attrs cimport FLAG13, FLAG14
|
|
||||||
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
|
|
||||||
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
|
|
||||||
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
|
|
||||||
from ..attrs cimport IS_ALPHA as _IS_ALPHA
|
|
||||||
from ..attrs cimport IS_DIGIT as _IS_DIGIT
|
|
||||||
from ..attrs cimport IS_ASCII as _IS_ASCII
|
|
||||||
from ..attrs cimport IS_LOWER as _IS_LOWER
|
|
||||||
from ..attrs cimport IS_PUNCT as _IS_PUNCT
|
|
||||||
from ..attrs cimport IS_SPACE as _IS_SPACE
|
|
||||||
from ..attrs cimport IS_TITLE as _IS_TITLE
|
|
||||||
from ..attrs cimport IS_UPPER as _IS_UPPER
|
|
||||||
from ..attrs cimport IS_OOV as _IS_OOV
|
|
||||||
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
|
|
||||||
from ..attrs cimport LIKE_URL as _LIKE_URL
|
|
||||||
from ..attrs cimport LIKE_NUM as _LIKE_NUM
|
|
||||||
from ..attrs cimport IS_STOP as _IS_STOP
|
|
||||||
from ..attrs cimport ORTH as _ORTH
|
|
||||||
from ..attrs cimport SHAPE as _SHAPE
|
|
||||||
from ..attrs cimport LOWER as _LOWER
|
|
||||||
from ..attrs cimport NORM as _NORM
|
|
||||||
from ..attrs cimport CLUSTER as _CLUSTER
|
|
||||||
from ..attrs cimport PREFIX as _PREFIX
|
|
||||||
from ..attrs cimport SUFFIX as _SUFFIX
|
|
||||||
from ..attrs cimport LEMMA as _LEMMA
|
|
||||||
from ..attrs cimport POS as _POS
|
|
||||||
from ..attrs cimport TAG as _TAG
|
|
||||||
from ..attrs cimport DEP as _DEP
|
|
||||||
from ..attrs cimport HEAD as _HEAD
|
|
||||||
from ..attrs cimport ENT_IOB as _ENT_IOB
|
|
||||||
from ..attrs cimport ENT_TYPE as _ENT_TYPE
|
|
||||||
from ..attrs cimport SPACY as _SPACY
|
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
|
||||||
IS_ALPHA = _IS_ALPHA
|
|
||||||
IS_ASCII = _IS_ASCII
|
|
||||||
IS_DIGIT = _IS_DIGIT
|
|
||||||
IS_LOWER = _IS_LOWER
|
|
||||||
IS_PUNCT = _IS_PUNCT
|
|
||||||
IS_SPACE = _IS_SPACE
|
|
||||||
IS_TITLE = _IS_TITLE
|
|
||||||
IS_UPPER = _IS_UPPER
|
|
||||||
LIKE_URL = _LIKE_URL
|
|
||||||
LIKE_NUM = _LIKE_NUM
|
|
||||||
LIKE_EMAIL = _LIKE_EMAIL
|
|
||||||
IS_STOP = _IS_STOP
|
|
||||||
IS_OOV = _IS_OOV
|
|
||||||
|
|
||||||
ORTH = _ORTH
|
|
||||||
SHAPE = _SHAPE
|
|
||||||
LOWER = _LOWER
|
|
||||||
NORM = _NORM
|
|
||||||
PREFIX = _PREFIX
|
|
||||||
SUFFIX = _SUFFIX
|
|
||||||
CLUSTER = _CLUSTER
|
|
||||||
LEMMA = _LEMMA
|
|
||||||
POS = _POS
|
|
||||||
TAG = _TAG
|
|
||||||
DEP = _DEP
|
|
||||||
ENT_IOB = _ENT_IOB
|
|
||||||
ENT_TYPE = _ENT_TYPE
|
|
||||||
HEAD = _HEAD
|
|
||||||
SPACY = _SPACY
|
|
|
@ -1,21 +0,0 @@
|
||||||
# cython: embedsignature=True
|
|
||||||
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
|
|
||||||
from ..orth cimport is_title, is_upper, like_url, like_number, like_email
|
|
||||||
from ..typedefs cimport flags_t
|
|
||||||
|
|
||||||
|
|
||||||
def get_flags(unicode string, is_oov=False):
|
|
||||||
cdef flags_t flags = 0
|
|
||||||
flags |= is_oov << IS_OOV
|
|
||||||
flags |= is_alpha(string) << IS_ALPHA
|
|
||||||
flags |= is_ascii(string) << IS_ASCII
|
|
||||||
flags |= is_digit(string) << IS_DIGIT
|
|
||||||
flags |= is_lower(string) << IS_LOWER
|
|
||||||
flags |= is_punct(string) << IS_PUNCT
|
|
||||||
flags |= is_space(string) << IS_SPACE
|
|
||||||
flags |= is_title(string) << IS_TITLE
|
|
||||||
flags |= is_upper(string) << IS_UPPER
|
|
||||||
flags |= like_url(string) << LIKE_URL
|
|
||||||
flags |= like_number(string) << LIKE_NUM
|
|
||||||
flags |= like_email(string) << LIKE_EMAIL
|
|
||||||
return flags
|
|
|
@ -1,5 +0,0 @@
|
||||||
from ..tagger cimport Tagger
|
|
||||||
|
|
||||||
|
|
||||||
cdef class EnPosTagger(Tagger):
|
|
||||||
pass
|
|
|
@ -1,11 +0,0 @@
|
||||||
from os import path
|
|
||||||
|
|
||||||
from ..parts_of_speech cimport NOUN, VERB, ADJ
|
|
||||||
|
|
||||||
from ..lemmatizer import Lemmatizer
|
|
||||||
|
|
||||||
|
|
||||||
cdef class EnPosTagger(Tagger):
|
|
||||||
"""A part-of-speech tagger for English"""
|
|
||||||
def make_lemmatizer(self, data_dir):
|
|
||||||
return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
|
|
|
@ -1,11 +1,18 @@
|
||||||
|
# cython: profile=True
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
|
from .typedefs cimport hash_t
|
||||||
from .attrs cimport attr_id_t
|
from .attrs cimport attr_id_t
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC, LexemeC
|
||||||
|
from .lexeme cimport Lexeme
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
||||||
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
||||||
|
@ -15,6 +22,38 @@ from .vocab cimport Vocab
|
||||||
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
|
from .attrs import FLAG61 as U_ENT
|
||||||
|
|
||||||
|
from .attrs import FLAG60 as B2_ENT
|
||||||
|
from .attrs import FLAG59 as B3_ENT
|
||||||
|
from .attrs import FLAG58 as B4_ENT
|
||||||
|
from .attrs import FLAG57 as B5_ENT
|
||||||
|
from .attrs import FLAG56 as B6_ENT
|
||||||
|
from .attrs import FLAG55 as B7_ENT
|
||||||
|
from .attrs import FLAG54 as B8_ENT
|
||||||
|
from .attrs import FLAG53 as B9_ENT
|
||||||
|
from .attrs import FLAG52 as B10_ENT
|
||||||
|
|
||||||
|
from .attrs import FLAG51 as I3_ENT
|
||||||
|
from .attrs import FLAG50 as I4_ENT
|
||||||
|
from .attrs import FLAG49 as I5_ENT
|
||||||
|
from .attrs import FLAG48 as I6_ENT
|
||||||
|
from .attrs import FLAG47 as I7_ENT
|
||||||
|
from .attrs import FLAG46 as I8_ENT
|
||||||
|
from .attrs import FLAG45 as I9_ENT
|
||||||
|
from .attrs import FLAG44 as I10_ENT
|
||||||
|
|
||||||
|
from .attrs import FLAG43 as L2_ENT
|
||||||
|
from .attrs import FLAG42 as L3_ENT
|
||||||
|
from .attrs import FLAG41 as L4_ENT
|
||||||
|
from .attrs import FLAG40 as L5_ENT
|
||||||
|
from .attrs import FLAG39 as L6_ENT
|
||||||
|
from .attrs import FLAG38 as L7_ENT
|
||||||
|
from .attrs import FLAG37 as L8_ENT
|
||||||
|
from .attrs import FLAG36 as L9_ENT
|
||||||
|
from .attrs import FLAG35 as L10_ENT
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ujson as json
|
import ujson as json
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -41,7 +80,7 @@ cdef Pattern* init_pattern(Pool mem, object token_specs, attr_t entity_type) exc
|
||||||
pattern[i].spec[j].attr = attr
|
pattern[i].spec[j].attr = attr
|
||||||
pattern[i].spec[j].value = value
|
pattern[i].spec[j].value = value
|
||||||
i = len(token_specs)
|
i = len(token_specs)
|
||||||
pattern[i].spec = <AttrValue*>mem.alloc(1, sizeof(AttrValue))
|
pattern[i].spec = <AttrValue*>mem.alloc(2, sizeof(AttrValue))
|
||||||
pattern[i].spec[0].attr = ENT_TYPE
|
pattern[i].spec[0].attr = ENT_TYPE
|
||||||
pattern[i].spec[0].value = entity_type
|
pattern[i].spec[0].value = entity_type
|
||||||
pattern[i].spec[1].attr = LENGTH
|
pattern[i].spec[1].attr = LENGTH
|
||||||
|
@ -81,7 +120,33 @@ def _convert_strings(token_specs, string_store):
|
||||||
value = int(value)
|
value = int(value)
|
||||||
converted[-1].append((attr, value))
|
converted[-1].append((attr, value))
|
||||||
return converted
|
return converted
|
||||||
|
|
||||||
|
|
||||||
|
def get_bilou(length):
|
||||||
|
if length == 1:
|
||||||
|
return [U_ENT]
|
||||||
|
elif length == 2:
|
||||||
|
return [B2_ENT, L2_ENT]
|
||||||
|
elif length == 3:
|
||||||
|
return [B3_ENT, I3_ENT, L3_ENT]
|
||||||
|
elif length == 4:
|
||||||
|
return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
|
||||||
|
elif length == 5:
|
||||||
|
return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
|
||||||
|
elif length == 6:
|
||||||
|
return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
|
||||||
|
elif length == 7:
|
||||||
|
return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
|
||||||
|
elif length == 8:
|
||||||
|
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
|
||||||
|
elif length == 9:
|
||||||
|
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
|
||||||
|
elif length == 10:
|
||||||
|
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
||||||
|
I10_ENT, I10_ENT, L10_ENT]
|
||||||
|
else:
|
||||||
|
raise ValueError("Max length currently 10 for phrase matching")
|
||||||
|
|
||||||
|
|
||||||
def map_attr_name(attr):
|
def map_attr_name(attr):
|
||||||
attr = attr.upper()
|
attr = attr.upper()
|
||||||
|
@ -95,32 +160,6 @@ def map_attr_name(attr):
|
||||||
return SHAPE
|
return SHAPE
|
||||||
elif attr == 'NORM':
|
elif attr == 'NORM':
|
||||||
return NORM
|
return NORM
|
||||||
elif attr == 'FLAG13':
|
|
||||||
return FLAG13
|
|
||||||
elif attr == 'FLAG14':
|
|
||||||
return FLAG14
|
|
||||||
elif attr == 'FLAG15':
|
|
||||||
return FLAG15
|
|
||||||
elif attr == 'FLAG16':
|
|
||||||
return FLAG16
|
|
||||||
elif attr == 'FLAG17':
|
|
||||||
return FLAG17
|
|
||||||
elif attr == 'FLAG18':
|
|
||||||
return FLAG18
|
|
||||||
elif attr == 'FLAG19':
|
|
||||||
return FLAG19
|
|
||||||
elif attr == 'FLAG20':
|
|
||||||
return FLAG20
|
|
||||||
elif attr == 'FLAG21':
|
|
||||||
return FLAG21
|
|
||||||
elif attr == 'FLAG22':
|
|
||||||
return FLAG22
|
|
||||||
elif attr == 'FLAG23':
|
|
||||||
return FLAG23
|
|
||||||
elif attr == 'FLAG24':
|
|
||||||
return FLAG24
|
|
||||||
elif attr == 'FLAG25':
|
|
||||||
return FLAG25
|
|
||||||
else:
|
else:
|
||||||
raise Exception("TODO: Finish supporting attr mapping %s" % attr)
|
raise Exception("TODO: Finish supporting attr mapping %s" % attr)
|
||||||
|
|
||||||
|
@ -163,7 +202,7 @@ cdef class Matcher:
|
||||||
spec = _convert_strings(spec, self.vocab.strings)
|
spec = _convert_strings(spec, self.vocab.strings)
|
||||||
self.patterns.push_back(init_pattern(self.mem, spec, etype))
|
self.patterns.push_back(init_pattern(self.mem, spec, etype))
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc, acceptor=None):
|
||||||
cdef vector[Pattern*] partials
|
cdef vector[Pattern*] partials
|
||||||
cdef int n_partials = 0
|
cdef int n_partials = 0
|
||||||
cdef int q = 0
|
cdef int q = 0
|
||||||
|
@ -174,21 +213,94 @@ cdef class Matcher:
|
||||||
for token_i in range(doc.length):
|
for token_i in range(doc.length):
|
||||||
token = &doc.data[token_i]
|
token = &doc.data[token_i]
|
||||||
q = 0
|
q = 0
|
||||||
|
# Go over the open matches, extending or finalizing if able. Otherwise,
|
||||||
|
# we over-write them (q doesn't advance)
|
||||||
for i in range(partials.size()):
|
for i in range(partials.size()):
|
||||||
state = partials.at(i)
|
state = partials.at(i)
|
||||||
if match(state, token):
|
if match(state, token):
|
||||||
if is_final(state):
|
if is_final(state):
|
||||||
matches.append(get_entity(state, token, token_i))
|
label, start, end = get_entity(state, token, token_i)
|
||||||
|
if acceptor is None or acceptor(doc, label, start, end):
|
||||||
|
matches.append((label, start, end))
|
||||||
else:
|
else:
|
||||||
partials[q] = state + 1
|
partials[q] = state + 1
|
||||||
q += 1
|
q += 1
|
||||||
partials.resize(q)
|
partials.resize(q)
|
||||||
|
# Check whether we open any new patterns on this token
|
||||||
for i in range(self.n_patterns):
|
for i in range(self.n_patterns):
|
||||||
state = self.patterns[i]
|
state = self.patterns[i]
|
||||||
if match(state, token):
|
if match(state, token):
|
||||||
if is_final(state):
|
if is_final(state):
|
||||||
matches.append(get_entity(state, token, token_i))
|
label, start, end = get_entity(state, token, token_i)
|
||||||
|
if acceptor is None or acceptor(doc, label, start, end):
|
||||||
|
matches.append((label, start, end))
|
||||||
else:
|
else:
|
||||||
partials.push_back(state + 1)
|
partials.push_back(state + 1)
|
||||||
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
|
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
||||||
|
cdef class PhraseMatcher:
|
||||||
|
cdef Pool mem
|
||||||
|
cdef Vocab vocab
|
||||||
|
cdef Matcher matcher
|
||||||
|
cdef PreshMap phrase_ids
|
||||||
|
|
||||||
|
cdef int max_length
|
||||||
|
cdef attr_t* _phrase_key
|
||||||
|
|
||||||
|
def __init__(self, Vocab vocab, phrases, max_length=10):
|
||||||
|
self.mem = Pool()
|
||||||
|
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
|
||||||
|
self.max_length = max_length
|
||||||
|
self.vocab = vocab
|
||||||
|
self.matcher = Matcher(self.vocab, {})
|
||||||
|
self.phrase_ids = PreshMap()
|
||||||
|
for phrase in phrases:
|
||||||
|
if len(phrase) < max_length:
|
||||||
|
self.add(phrase)
|
||||||
|
|
||||||
|
abstract_patterns = []
|
||||||
|
for length in range(1, max_length):
|
||||||
|
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
||||||
|
self.matcher.add('Candidate', 'MWE', {}, abstract_patterns)
|
||||||
|
|
||||||
|
def add(self, Doc tokens):
|
||||||
|
cdef int length = tokens.length
|
||||||
|
assert length < self.max_length
|
||||||
|
tags = get_bilou(length)
|
||||||
|
assert len(tags) == length, length
|
||||||
|
|
||||||
|
cdef int i
|
||||||
|
for i in range(self.max_length):
|
||||||
|
self._phrase_key[i] = 0
|
||||||
|
for i, tag in enumerate(tags):
|
||||||
|
lexeme = self.vocab[tokens.data[i].lex.orth]
|
||||||
|
lexeme.set_flag(tag, True)
|
||||||
|
self._phrase_key[i] = lexeme.orth
|
||||||
|
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
||||||
|
self.phrase_ids[key] = True
|
||||||
|
|
||||||
|
def __call__(self, Doc doc):
|
||||||
|
matches = []
|
||||||
|
for label, start, end in self.matcher(doc, acceptor=self.accept_match):
|
||||||
|
cand = doc[start : end]
|
||||||
|
start = cand[0].idx
|
||||||
|
end = cand[-1].idx + len(cand[-1])
|
||||||
|
matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
|
||||||
|
for match in matches:
|
||||||
|
doc.merge(*match)
|
||||||
|
return matches
|
||||||
|
|
||||||
|
def accept_match(self, Doc doc, int label, int start, int end):
|
||||||
|
assert (end - start) < self.max_length
|
||||||
|
cdef int i, j
|
||||||
|
for i in range(self.max_length):
|
||||||
|
self._phrase_key[i] = 0
|
||||||
|
for i, j in enumerate(range(start, end)):
|
||||||
|
self._phrase_key[i] = doc.data[j].lex.orth
|
||||||
|
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
||||||
|
if self.phrase_ids.get(key):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
|
@ -31,10 +31,7 @@ cdef class Morphology:
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
cdef int tag_id
|
cdef int tag_id
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
try:
|
tag_id = self.reverse_index[self.strings[tag]]
|
||||||
tag_id = self.reverse_index[self.strings[tag]]
|
|
||||||
except KeyError:
|
|
||||||
raise
|
|
||||||
else:
|
else:
|
||||||
tag_id = tag
|
tag_id = tag
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
|
|
|
@ -11,6 +11,7 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from text_unidecode import unidecode
|
from text_unidecode import unidecode
|
||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
@ -165,7 +166,7 @@ cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):
|
||||||
|
|
||||||
|
|
||||||
cpdef bytes asciied(unicode string):
|
cpdef bytes asciied(unicode string):
|
||||||
cdef str stripped = unidecode(string)
|
stripped = unidecode(string)
|
||||||
if not stripped:
|
if not stripped:
|
||||||
return b'???'
|
return b'???'
|
||||||
return stripped.encode('ascii')
|
return stripped.encode('ascii')
|
||||||
|
|
|
@ -96,7 +96,9 @@ cdef class Vocab:
|
||||||
lex = <LexemeC*>self._by_hash.get(key)
|
lex = <LexemeC*>self._by_hash.get(key)
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
assert lex.orth == self.strings[string]
|
if lex.orth != self.strings[string]:
|
||||||
|
raise LookupError.mismatched_strings(
|
||||||
|
lex.orth, self.strings[lex.orth], string)
|
||||||
return lex
|
return lex
|
||||||
else:
|
else:
|
||||||
return self._new_lexeme(mem, string)
|
return self._new_lexeme(mem, string)
|
||||||
|
@ -352,6 +354,21 @@ def write_binary_vectors(in_loc, out_loc):
|
||||||
out_file.write_from(vec, vec_len, sizeof(float))
|
out_file.write_from(vec, vec_len, sizeof(float))
|
||||||
|
|
||||||
|
|
||||||
|
class LookupError(Exception):
|
||||||
|
@classmethod
|
||||||
|
def mismatched_strings(cls, id_, id_string, original_string):
|
||||||
|
return cls(
|
||||||
|
"Error fetching a Lexeme from the Vocab. When looking up a string, "
|
||||||
|
"the lexeme returned had an orth ID that did not match the query string. "
|
||||||
|
"This means that the cached lexeme structs are mismatched to the "
|
||||||
|
"string encoding table. The mismatched:\n"
|
||||||
|
"Query string: {query}\n"
|
||||||
|
"Orth cached: {orth_str}\n"
|
||||||
|
"ID of orth: {orth_id}".format(
|
||||||
|
query=original_string, orth_str=id_string, orth_id=id_)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class VectorReadError(Exception):
|
class VectorReadError(Exception):
|
||||||
@classmethod
|
@classmethod
|
||||||
def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
|
def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
|
||||||
|
|
|
@ -3,6 +3,7 @@ import pytest
|
||||||
|
|
||||||
from spacy.matcher import Matcher
|
from spacy.matcher import Matcher
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_overlap_issue118(EN):
|
def test_overlap_issue118(EN):
|
||||||
'''Test a bug that arose from having overlapping matches'''
|
'''Test a bug that arose from having overlapping matches'''
|
||||||
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
|
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.en import attrs
|
from spacy import attrs
|
||||||
|
|
||||||
|
|
||||||
def test_attr_of_token(EN):
|
def test_attr_of_token(EN):
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT
|
from spacy.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT
|
||||||
from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM
|
from spacy.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM
|
||||||
from spacy.en.attrs import IS_STOP
|
from spacy.attrs import IS_STOP
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from spacy.en.attrs import *
|
from spacy.attrs import *
|
||||||
|
|
||||||
|
|
||||||
def test_is_alpha(en_vocab):
|
def test_is_alpha(en_vocab):
|
||||||
|
|
|
@ -26,6 +26,7 @@ def test_main_entry_point(nlp):
|
||||||
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
|
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_sentence_spans(nlp):
|
def test_sentence_spans(nlp):
|
||||||
# from spacy.en import English
|
# from spacy.en import English
|
||||||
# nlp = English()
|
# nlp = English()
|
||||||
|
@ -33,6 +34,7 @@ def test_sentence_spans(nlp):
|
||||||
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_entity_spans(nlp):
|
def test_entity_spans(nlp):
|
||||||
# from spacy.en import English
|
# from spacy.en import English
|
||||||
# nlp = English()
|
# nlp = English()
|
||||||
|
@ -44,6 +46,7 @@ def test_entity_spans(nlp):
|
||||||
assert ents[0].string == ents[0].string
|
assert ents[0].string == ents[0].string
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_noun_chunk_spans(nlp):
|
def test_noun_chunk_spans(nlp):
|
||||||
# from spacy.en import English
|
# from spacy.en import English
|
||||||
# nlp = English()
|
# nlp = English()
|
||||||
|
@ -56,11 +59,12 @@ def test_noun_chunk_spans(nlp):
|
||||||
# NP three noun chunks <-- has
|
# NP three noun chunks <-- has
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_count_by(nlp):
|
def test_count_by(nlp):
|
||||||
# from spacy.en import English, attrs
|
# from spacy.en import English, attrs
|
||||||
# nlp = English()
|
# nlp = English()
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.en import attrs
|
from spacy import attrs
|
||||||
tokens = nlp('apple apple orange banana')
|
tokens = nlp('apple apple orange banana')
|
||||||
assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1}
|
assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1}
|
||||||
assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529],
|
assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529],
|
||||||
|
@ -88,6 +92,7 @@ def test_token_span(doc):
|
||||||
assert token.i == 4
|
assert token.i == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_example_i_like_new_york1(nlp):
|
def test_example_i_like_new_york1(nlp):
|
||||||
toks = nlp('I like New York in Autumn.')
|
toks = nlp('I like New York in Autumn.')
|
||||||
|
|
||||||
|
@ -127,16 +132,19 @@ def dot(toks):
|
||||||
return tok(toks, "dot")
|
return tok(toks, "dot")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_example_i_like_new_york3(toks, new, york):
|
def test_example_i_like_new_york3(toks, new, york):
|
||||||
assert toks[new].head.orth_ == 'York'
|
assert toks[new].head.orth_ == 'York'
|
||||||
assert toks[york].head.orth_ == 'like'
|
assert toks[york].head.orth_ == 'like'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_example_i_like_new_york4(toks, new, york):
|
def test_example_i_like_new_york4(toks, new, york):
|
||||||
new_york = toks[new:york+1]
|
new_york = toks[new:york+1]
|
||||||
assert new_york.root.orth_ == 'York'
|
assert new_york.root.orth_ == 'York'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_example_i_like_new_york5(toks, autumn, dot):
|
def test_example_i_like_new_york5(toks, autumn, dot):
|
||||||
assert toks[autumn].head.orth_ == 'in'
|
assert toks[autumn].head.orth_ == 'in'
|
||||||
assert toks[dot].head.orth_ == 'like'
|
assert toks[dot].head.orth_ == 'like'
|
||||||
|
@ -144,6 +152,7 @@ def test_example_i_like_new_york5(toks, autumn, dot):
|
||||||
assert autumn_dot.root.orth_ == 'Autumn'
|
assert autumn_dot.root.orth_ == 'Autumn'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_navigating_the_parse_tree_lefts(doc):
|
def test_navigating_the_parse_tree_lefts(doc):
|
||||||
# TODO: where does the span object come from?
|
# TODO: where does the span object come from?
|
||||||
span = doc[:2]
|
span = doc[:2]
|
||||||
|
@ -151,6 +160,7 @@ def test_navigating_the_parse_tree_lefts(doc):
|
||||||
if span.doc[i].head in span]
|
if span.doc[i].head in span]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_navigating_the_parse_tree_rights(doc):
|
def test_navigating_the_parse_tree_rights(doc):
|
||||||
span = doc[:2]
|
span = doc[:2]
|
||||||
rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
import spacy.en
|
import spacy
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
|
@ -22,6 +22,7 @@ def test_get_tokens_and_sentences(doc):
|
||||||
assert sentence.text == 'Hello, world.'
|
assert sentence.text == 'Hello, world.'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_use_integer_ids_for_any_strings(nlp, token):
|
def test_use_integer_ids_for_any_strings(nlp, token):
|
||||||
hello_id = nlp.vocab.strings['Hello']
|
hello_id = nlp.vocab.strings['Hello']
|
||||||
hello_str = nlp.vocab.strings[hello_id]
|
hello_str = nlp.vocab.strings[hello_id]
|
||||||
|
@ -45,7 +46,7 @@ def test_get_and_set_string_views_and_flags(nlp, token):
|
||||||
|
|
||||||
|
|
||||||
def test_export_to_numpy_arrays(nlp, doc):
|
def test_export_to_numpy_arrays(nlp, doc):
|
||||||
from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
|
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
|
||||||
|
|
||||||
attr_ids = [ORTH, LIKE_URL, IS_OOV]
|
attr_ids = [ORTH, LIKE_URL, IS_OOV]
|
||||||
doc_array = doc.to_array(attr_ids)
|
doc_array = doc.to_array(attr_ids)
|
||||||
|
@ -68,6 +69,7 @@ def test_word_vectors(nlp):
|
||||||
assert apples.similarity(oranges) > boots.similarity(hippos)
|
assert apples.similarity(oranges) > boots.similarity(hippos)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
def test_part_of_speech_tags(nlp):
|
def test_part_of_speech_tags(nlp):
|
||||||
from spacy.parts_of_speech import ADV
|
from spacy.parts_of_speech import ADV
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ include ./meta.jade
|
||||||
|
|
||||||
p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses.
|
p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses.
|
||||||
|
|
||||||
p It doesn't always guess right, but we can tell how often it does, and we can think of ways t help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit.
|
p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit.
|
||||||
|
|
||||||
p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.)
|
p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user