Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Yubing (Tom) Dong 2015-10-08 22:53:02 -07:00
commit 9a6811acc4
24 changed files with 534 additions and 241 deletions

View File

@ -24,4 +24,4 @@ install:
# run tests # run tests
script: script:
- "py.test tests/ website/tests/ -x" - "py.test tests/ -x"

View File

@ -148,8 +148,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp.end_training(model_dir) nlp.end_training(model_dir)
print('done') print('done')
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None): beam_width=None, cand_preproc=None):
nlp = Language(data_dir=model_dir) nlp = Language(data_dir=model_dir)
if beam_width is not None: if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width nlp.parser.cfg.beam_width = beam_width
@ -166,16 +167,14 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
nlp.entity(tokens) nlp.entity(tokens)
nlp.parser(tokens) nlp.parser(tokens)
else: else:
tokens = nlp(raw_text, merge_mwes=False) tokens = nlp(raw_text)
gold = GoldParse(tokens, annot_tuples) gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose) scorer.score(tokens, gold, verbose=verbose)
return scorer return scorer
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): def write_parses(Language, dev_loc, model_dir, out_loc):
nlp = Language(data_dir=model_dir) nlp = Language(data_dir=model_dir)
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
gold_tuples = read_json_file(dev_loc) gold_tuples = read_json_file(dev_loc)
scorer = Scorer() scorer = Scorer()
out_file = codecs.open(out_loc, 'w', 'utf8') out_file = codecs.open(out_loc, 'w', 'utf8')
@ -188,14 +187,16 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
nlp.entity(tokens) nlp.entity(tokens)
nlp.parser(tokens) nlp.parser(tokens)
else: else:
tokens = nlp(raw_text, merge_mwes=False) tokens = nlp(raw_text)
gold = GoldParse(tokens, annot_tuples) #gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=False) #scorer.score(tokens, gold, verbose=False)
for t in tokens: for sent in tokens.sents:
out_file.write( for t in sent:
'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) if not t.is_space:
) out_file.write(
return scorer '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
)
out_file.write('\n')
@plac.annotations( @plac.annotations(
@ -220,14 +221,15 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
gold_preproc=gold_preproc, n_sents=n_sents, gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter, corruption_level=corruption_level, n_iter=n_iter,
verbose=verbose) verbose=verbose)
#if out_loc: if out_loc:
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) write_parses(English, dev_loc, model_dir, out_loc)
scorer = evaluate(English, list(read_json_file(dev_loc)), scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose) model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc) print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc) print('POS', scorer.tags_acc)
print('UAS', scorer.uas) print('UAS', scorer.uas)
print('LAS', scorer.las) print('LAS', scorer.las)
print('SBD', scorer.sbd_acc)
print('NER P', scorer.ents_p) print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r) print('NER R', scorer.ents_r)

151
bin/parser/train_ud.py Normal file
View File

@ -0,0 +1,151 @@
import plac
import json
from os import path
import shutil
import os
import random
from spacy.syntax.util import Config
from spacy.gold import GoldParse
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import get_templates
from spacy.scorer import Scorer
from spacy.language import Language
from spacy.tagger import W_orth
# Feature templates handed to Tagger.blank() in TreebankParser.from_dir:
# a single template that contains only W_orth.
TAGGER_TEMPLATES = (
    (W_orth,),
)

# Shadow the builtin open() with codecs.open so that read_conllx can pass the
# encoding as the third positional argument.
try:
    from codecs import open
except ImportError:
    pass
class TreebankParser(object):
    """Bundle a vocab, tokenizer, tagger and parser for treebank training.

    Instances are callable: calling one with a list of words returns the
    tagged and parsed token sequence produced by the tokenizer.
    """

    @staticmethod
    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
        """Create fresh 'deps' and 'pos' directories and write the parser config.

        Any existing 'deps' or 'pos' directory under model_dir is deleted
        first. `templates` is accepted for interface compatibility but is not
        used here.
        """
        dep_model_dir = path.join(model_dir, 'deps')
        pos_model_dir = path.join(model_dir, 'pos')
        for sub_dir in (dep_model_dir, pos_model_dir):
            if path.exists(sub_dir):
                shutil.rmtree(sub_dir)
            # makedirs (not mkdir) so a model_dir that does not exist yet is
            # created instead of raising OSError.
            os.makedirs(sub_dir)

        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                     labels=labels)

    @classmethod
    def from_dir(cls, tag_map, model_dir):
        """Build a TreebankParser with a blank tagger and the parser in model_dir/deps."""
        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

        # NOTE(review): the value read here is never used below; kept in case
        # the read is relied on to fail early when the config is missing.
        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
        return cls(vocab, tokenizer, tagger, parser)

    def __init__(self, vocab, tokenizer, tagger, parser):
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tagger = tagger
        self.parser = parser

    def train(self, words, tags, heads, deps):
        """Update the tagger and, for projective trees, the parser on one sentence."""
        tokens = self.tokenizer.tokens_from_list(list(words))
        self.tagger.train(tokens, tags)

        # Start again from a fresh token sequence so the parser is trained on
        # the tagger's own predictions rather than on the training state above.
        tokens = self.tokenizer.tokens_from_list(list(words))
        # A real list, so the ids can be indexed and reused on Python 3 too.
        ids = list(range(len(words)))
        ner = ['O'] * len(words)
        gold = GoldParse(tokens, (ids, words, tags, heads, deps, ner),
                         make_projective=False)
        self.tagger(tokens)
        if gold.is_projective:
            try:
                self.parser.train(tokens, gold)
            except Exception:
                # Dump the offending sentence to help debugging, then re-raise.
                for id_, word, head, dep in zip(ids, words, heads, deps):
                    print(id_, word, head, dep)
                raise

    def __call__(self, words, tags=None):
        """Tag (or apply the given tags to) and parse a list of words."""
        tokens = self.tokenizer.tokens_from_list(list(words))
        if tags is None:
            self.tagger(tokens)
        else:
            self.tagger.tag_from_strings(tokens, tags)
        self.parser(tokens)
        return tokens

    def end_training(self, data_dir):
        """Finalize both models and dump the string store under data_dir."""
        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
        # setup_model_dir only creates 'deps' and 'pos'; make sure the target
        # directory for the strings file exists before writing into it.
        vocab_dir = path.join(data_dir, 'vocab')
        if not path.exists(vocab_dir):
            os.makedirs(vocab_dir)
        self.vocab.strings.dump(path.join(vocab_dir, 'strings.txt'))
def read_conllx(loc):
    """Read a 10-column CoNLL-X/CoNLL-U file and yield one entry per sentence.

    Each yielded item is ``(None, [(tuples, [])])`` where ``tuples`` is a list
    of six parallel tuples: ids, words, tags, heads, deps and NER labels.
    Token ids and heads are converted to 0-based indices; a head of '0' is
    replaced by the token's own index; the label 'root' becomes 'ROOT'; every
    NER label is 'O'.

    Comment lines (starting with '#'), blank lines and range ids such as
    '3-4' are skipped. Sentences left with no tokens are not yielded.

    Raises ValueError if a token line does not have exactly 10
    whitespace-separated columns.
    """
    import io

    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    for sent in text.strip().split('\n\n'):
        tokens = []
        for line in sent.strip().split('\n'):
            # A sentence may carry several comment lines, not just one.
            if not line.strip() or line.startswith('#'):
                continue
            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
            if '-' in id_:
                continue
            id_ = int(id_) - 1
            head = (int(head) - 1) if head != '0' else id_
            dep = 'ROOT' if dep == 'root' else dep
            tokens.append((id_, word, tag, head, dep, 'O'))
        if not tokens:
            # Empty file or comment-only chunk: nothing to train or score on.
            continue
        # Materialize the columns so callers can index them on Python 3 too.
        tuples = list(zip(*tokens))
        yield (None, [(tuples, [])])
def score_model(nlp, gold_docs, verbose=False):
    """Run `nlp` over every gold sentence and return the accumulated Scorer.

    `gold_docs` is iterated as ``(_, sentences)`` pairs, and each sentence as
    ``(annot_tuples, _)``. The words (index 1) and tags (index 2) of the
    annotations are passed to `nlp`, so the gold tags are supplied to the
    pipeline rather than predicted.
    """
    scorer = Scorer()
    for _, sentences in gold_docs:
        for annotations, _ in sentences:
            words = list(annotations[1])
            gold_tags = list(annotations[2])
            parsed = nlp(words, tags=gold_tags)
            reference = GoldParse(parsed, annotations)
            scorer.score(parsed, reference, verbose=verbose)
    return scorer
def main(train_loc, dev_loc, model_dir, tag_map_loc):
    """Train a TreebankParser on a CoNLL file and print dev-set scores.

    Loads the tag map from JSON, reads the training sentences, sets up the
    model directory, then runs 15 passes over the training data. After each
    pass the data is shuffled and UAS / tag accuracy on the dev file are
    printed. After the final pass the model is finalized and UAS, LAS and tag
    accuracy are printed.
    """
    with open(tag_map_loc) as tag_map_file:
        tag_map = json.loads(tag_map_file.read())
    training_docs = list(read_conllx(train_loc))
    dep_labels = ArcEager.get_labels(training_docs)
    feature_templates = get_templates('basic')

    TreebankParser.setup_model_dir(model_dir, dep_labels, feature_templates)
    nlp = TreebankParser.from_dir(tag_map, model_dir)

    for itn in range(15):
        for _, doc_sents in training_docs:
            for annotations, _ in doc_sents:
                ids, words, tags, heads, deps, ner = annotations
                nlp.train(words, tags, heads, deps)
        random.shuffle(training_docs)
        dev_scores = score_model(nlp, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, dev_scores.uas, dev_scores.tags_acc))

    nlp.end_training(model_dir)
    final_scores = score_model(nlp, read_conllx(dev_loc))
    # itn still holds the index of the last training pass here.
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, final_scores.uas, final_scores.las,
                                     final_scores.tags_acc))
# Script entry point: hand the command-line arguments to main() via plac.
if __name__ == '__main__':
    plac.call(main)

View File

@ -0,0 +1,95 @@
Syllogism Contributor Agreement
===============================
This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
Agreement. The SCA applies to any contribution that you make to any product or
project managed by us (the “project”), and sets out the intellectual property
rights you grant to us in the contributed materials. The term “us” shall mean
Syllogism Co. The term "you" shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested below
and include the filled-in version with your first pull-request, under the file
contributors/. The name of the file should be your GitHub username, with the
extension .md. For example, the user example_user would create the file
spaCy/contributors/example_user.md .
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
1. The term 'contribution' or contributed materials means any source code,
object code, patch, tool, sample, graphic, specification, manual, documentation,
or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and registrations,
in your contribution:
* you hereby assign to us joint ownership, and to the extent that such assignment
is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
to exercise all rights under those copyrights. This includes, at our option, the
right to sublicense these same rights to third parties through multiple levels of
sublicensees or other licensing arrangements;
* you agree that each of us can do all things in relation to your contribution
as if each of us were the sole owners, and if one of us makes a derivative work
of your contribution, the one who makes the derivative work (or has it made) will
be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution against
us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and exercise
all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the consent
of, pay or render an accounting to the other for any use or distribution of your
contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer your
contribution in whole or in part, alone or in combination with
or included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through multiple
levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective on
the date you first submitted a contribution to us, even if your submission took
place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of authorship
and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any third
party's copyrights, trademarks, patents, or other intellectual property rights; and
* each contribution shall be in compliance with U.S. export control laws and other
applicable export and import laws. You agree to notify us if you become aware of
any circumstance which would make any of the foregoing representations inaccurate
in any respect. Syllogism Co. may publicly disclose your participation in the project,
including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable U.S.
Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
x I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Chris DuBois |
| Company's name (if applicable) | |
| Title or Role (if applicable) | |
| Date | 2015.10.07 |
| GitHub username | chrisdubois |
| Website (optional) | |

View File

@ -45,14 +45,14 @@ def main():
nlp = English() nlp = English()
texts = [ texts = [
u'Net income was $9.4 million compared to the prior year of $2.7 million.', u'Net income was $9.4 million compared to the prior year of $2.7 million.',
u'Revenue exceeded twelve billion dollars, with a loss of $1b', u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
] ]
for text in texts: for text in texts:
doc = nlp(text) doc = nlp(text)
relations = extract_currency_relations(doc) relations = extract_currency_relations(doc)
for r1, r2 in relations: for r1, r2 in relations:
print(r1.text, r2.ent_type_) print(r1.text, r2.ent_type_, r2.text)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -22,73 +22,77 @@ our pattern set stays very small (exact size depends on the maximum length we're
looking for, as the query language currently has no quantifiers) looking for, as the query language currently has no quantifiers)
""" """
from __future__ import print_function, unicode_literals, division from __future__ import print_function, unicode_literals, division
from ast import literal_eval
from bz2 import BZ2File
import time
import math
import codecs
import plac import plac
from preshed.maps import PreshMap from preshed.maps import PreshMap
from preshed.counter import PreshCounter
from spacy.strings import hash_string from spacy.strings import hash_string
from spacy.en import English from spacy.en import English
from spacy.matcher import Matcher from spacy.matcher import PhraseMatcher
from spacy.attrs import FLAG63 as U_ENT
from spacy.attrs import FLAG62 as L_ENT
from spacy.attrs import FLAG61 as I_ENT
from spacy.attrs import FLAG60 as B_ENT
def get_bilou(length): def read_gazetteer(tokenizer, loc, n=-1):
if length == 1: for i, line in enumerate(open(loc)):
return [U_ENT] phrase = literal_eval('u' + line.strip())
else: if ' (' in phrase and phrase.endswith(')'):
return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT] phrase = phrase.split(' (', 1)[0]
if i >= n:
break
phrase = tokenizer(phrase)
if all((t.is_lower and t.prob >= -10) for t in phrase):
continue
if len(phrase) >= 2:
yield phrase
def make_matcher(vocab, max_length): def read_text(bz2_loc):
abstract_patterns = [] with BZ2File(bz2_loc) as file_:
for length in range(1, max_length+1): for line in file_:
abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) yield line.decode('utf8')
return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)})
def get_matches(matcher, pattern_ids, doc): def get_matches(tokenizer, phrases, texts, max_length=6):
matches = [] matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
for label, start, end in matcher(doc): print("Match")
candidate = doc[start : end] for text in texts:
if pattern_ids[hash_string(candidate.text)] == True: doc = tokenizer(text)
start = candidate[0].idx matches = matcher(doc)
end = candidate[-1].idx + len(candidate[-1]) for mwe in doc.ents:
matches.append((start, end, candidate.root.tag_, candidate.text)) yield mwe
return matches
def merge_matches(doc, matches): def main(patterns_loc, text_loc, counts_loc, n=10000000):
for start, end, tag, text in matches:
doc.merge(start, end, tag, text, 'MWE')
def main():
nlp = English(parser=False, tagger=False, entity=False) nlp = English(parser=False, tagger=False, entity=False)
print("Make matcher")
gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones'] phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.' counts = PreshCounter()
pattern_ids = PreshMap() t1 = time.time()
max_length = 0 for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
for pattern_str in gazetteer: counts.inc(hash_string(mwe.text), 1)
pattern = nlp.tokenizer(pattern_str) t2 = time.time()
bilou_tags = get_bilou(len(pattern)) print("10m tokens in %d s" % (t2 - t1))
for word, tag in zip(pattern, bilou_tags):
lexeme = nlp.vocab[word.orth]
lexeme.set_flag(tag, True)
pattern_ids[hash_string(pattern.text)] = True
max_length = max(max_length, len(pattern))
matcher = make_matcher(nlp.vocab, max_length)
doc = nlp(example_text)
matches = get_matches(matcher, pattern_ids, doc)
merge_matches(doc, matches)
for token in doc:
print(token.text, token.ent_type_)
with codecs.open(counts_loc, 'w', 'utf8') as file_:
for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
text = phrase.string
key = hash_string(text)
count = counts[key]
if count != 0:
file_.write('%d\t%s\n' % (count, text))
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) if False:
import cProfile
import pstats
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
else:
plac.call(main)

View File

@ -13,5 +13,7 @@
"ADP": {"pos": "ADP"}, "ADP": {"pos": "ADP"},
"SYM": {"pos": "SYM"}, "SYM": {"pos": "SYM"},
"X": {"pos": "X"}, "X": {"pos": "X"},
"INTJ": {"pos": "INTJ"} "INTJ": {"pos": "INTJ"},
"DET": {"pos": "DET"},
"PART": {"pos": "PART"}
} }

View File

@ -2,43 +2,43 @@
"S": {"pos": "NOUN"}, "S": {"pos": "NOUN"},
"E": {"pos": "ADP"}, "E": {"pos": "ADP"},
"RD": {"pos": "DET"}, "RD": {"pos": "DET"},
"V": {"pos": "VER"}, "V": {"pos": "VERB"},
"_": {"pos": "_"}, "_": {"pos": "NO_TAG"},
"A": {"pos": "ADJ"}, "A": {"pos": "ADJ"},
"SP": {"pos": "PROP"}, "SP": {"pos": "PROPN"},
"FF": {"pos": "PUNC"}, "FF": {"pos": "PUNCT"},
"FS": {"pos": "PUNC"}, "FS": {"pos": "PUNCT"},
"B": {"pos": "ADV"}, "B": {"pos": "ADV"},
"CC": {"pos": "CON"}, "CC": {"pos": "CONJ"},
"FB": {"pos": "PUNC"}, "FB": {"pos": "PUNCT"},
"VA": {"pos": "AUX"}, "VA": {"pos": "AUX"},
"PC": {"pos": "PRO"}, "PC": {"pos": "PRON"},
"N": {"pos": "NUM"}, "N": {"pos": "NUM"},
"RI": {"pos": "DET"}, "RI": {"pos": "DET"},
"PR": {"pos": "PRO"}, "PR": {"pos": "PRON"},
"CS": {"pos": "SCON"}, "CS": {"pos": "SCONJ"},
"BN": {"pos": "ADV"}, "BN": {"pos": "ADV"},
"AP": {"pos": "DET"}, "AP": {"pos": "DET"},
"VM": {"pos": "AUX"}, "VM": {"pos": "AUX"},
"DI": {"pos": "DET"}, "DI": {"pos": "DET"},
"FC": {"pos": "PUNC"}, "FC": {"pos": "PUNCT"},
"PI": {"pos": "PRO"}, "PI": {"pos": "PRON"},
"DD": {"pos": "DET"}, "DD": {"pos": "DET"},
"DQ": {"pos": "DET"}, "DQ": {"pos": "DET"},
"PQ": {"pos": "PRO"}, "PQ": {"pos": "PRON"},
"PD": {"pos": "PRO"}, "PD": {"pos": "PRON"},
"NO": {"pos": "ADJ"}, "NO": {"pos": "ADJ"},
"PE": {"pos": "PRO"}, "PE": {"pos": "PRON"},
"T": {"pos": "DET"}, "T": {"pos": "DET"},
"X": {"pos": "SYM"}, "X": {"pos": "SYM"},
"SW": {"pos": "X"}, "SW": {"pos": "X"},
"NO": {"pos": "PRO"}, "NO": {"pos": "PRON"},
"I": {"pos": "INT"}, "I": {"pos": "INTJ"},
"X": {"pos": "X"}, "X": {"pos": "X"},
"DR": {"pos": "DET"}, "DR": {"pos": "DET"},
"EA": {"pos": "ADP"}, "EA": {"pos": "ADP"},
"PP": {"pos": "PRO"}, "PP": {"pos": "PRON"},
"X": {"pos": "NUM"}, "X": {"pos": "NUM"},
"DE": {"pos": "DET"}, "DE": {"pos": "DET"},
"X": {"pos": "PAR"} "X": {"pos": "PART"}
} }

View File

@ -156,8 +156,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.morphology', 'spacy.tagger', 'spacy.morphology', 'spacy.tagger',
'spacy.syntax.stateclass', 'spacy.syntax.stateclass',
'spacy._ml', 'spacy._theano', 'spacy._ml', 'spacy._theano',
'spacy.tokenizer', 'spacy.en.attrs', 'spacy.tokenizer',
'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax.parser',
'spacy.syntax.transition_system', 'spacy.syntax.transition_system',
'spacy.syntax.arc_eager', 'spacy.syntax.arc_eager',
'spacy.syntax._parse_features', 'spacy.syntax._parse_features',

View File

@ -1,64 +0,0 @@
from ..attrs cimport FLAG13, FLAG14
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
from ..attrs cimport IS_ALPHA as _IS_ALPHA
from ..attrs cimport IS_DIGIT as _IS_DIGIT
from ..attrs cimport IS_ASCII as _IS_ASCII
from ..attrs cimport IS_LOWER as _IS_LOWER
from ..attrs cimport IS_PUNCT as _IS_PUNCT
from ..attrs cimport IS_SPACE as _IS_SPACE
from ..attrs cimport IS_TITLE as _IS_TITLE
from ..attrs cimport IS_UPPER as _IS_UPPER
from ..attrs cimport IS_OOV as _IS_OOV
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
from ..attrs cimport LIKE_URL as _LIKE_URL
from ..attrs cimport LIKE_NUM as _LIKE_NUM
from ..attrs cimport IS_STOP as _IS_STOP
from ..attrs cimport ORTH as _ORTH
from ..attrs cimport SHAPE as _SHAPE
from ..attrs cimport LOWER as _LOWER
from ..attrs cimport NORM as _NORM
from ..attrs cimport CLUSTER as _CLUSTER
from ..attrs cimport PREFIX as _PREFIX
from ..attrs cimport SUFFIX as _SUFFIX
from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG
from ..attrs cimport DEP as _DEP
from ..attrs cimport HEAD as _HEAD
from ..attrs cimport ENT_IOB as _ENT_IOB
from ..attrs cimport ENT_TYPE as _ENT_TYPE
from ..attrs cimport SPACY as _SPACY
# Anonymous cpdef enum that re-declares, under their plain names, the
# attribute IDs cimported from ..attrs with underscore-prefixed aliases.
cpdef enum:
    # Flag attributes
    IS_ALPHA = _IS_ALPHA
    IS_ASCII = _IS_ASCII
    IS_DIGIT = _IS_DIGIT
    IS_LOWER = _IS_LOWER
    IS_PUNCT = _IS_PUNCT
    IS_SPACE = _IS_SPACE
    IS_TITLE = _IS_TITLE
    IS_UPPER = _IS_UPPER
    LIKE_URL = _LIKE_URL
    LIKE_NUM = _LIKE_NUM
    LIKE_EMAIL = _LIKE_EMAIL
    IS_STOP = _IS_STOP
    IS_OOV = _IS_OOV
    # Remaining attribute IDs
    ORTH = _ORTH
    SHAPE = _SHAPE
    LOWER = _LOWER
    NORM = _NORM
    PREFIX = _PREFIX
    SUFFIX = _SUFFIX
    CLUSTER = _CLUSTER
    LEMMA = _LEMMA
    POS = _POS
    TAG = _TAG
    DEP = _DEP
    ENT_IOB = _ENT_IOB
    ENT_TYPE = _ENT_TYPE
    HEAD = _HEAD
    SPACY = _SPACY

View File

@ -1,21 +0,0 @@
# cython: embedsignature=True
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
from ..orth cimport is_title, is_upper, like_url, like_number, like_email
from ..typedefs cimport flags_t
def get_flags(unicode string, is_oov=False):
    """Compute the bit field of boolean lexical flags for `string`.

    Each predicate result (and `is_oov`) is shifted to the bit position given
    by the matching attribute ID and OR-ed into the returned flags value.
    """
    cdef flags_t flags = 0
    flags |= is_oov << IS_OOV
    flags |= is_alpha(string) << IS_ALPHA
    flags |= is_ascii(string) << IS_ASCII
    flags |= is_digit(string) << IS_DIGIT
    flags |= is_lower(string) << IS_LOWER
    flags |= is_punct(string) << IS_PUNCT
    flags |= is_space(string) << IS_SPACE
    flags |= is_title(string) << IS_TITLE
    flags |= is_upper(string) << IS_UPPER
    flags |= like_url(string) << LIKE_URL
    flags |= like_number(string) << LIKE_NUM
    flags |= like_email(string) << LIKE_EMAIL
    return flags

View File

@ -1,5 +0,0 @@
from ..tagger cimport Tagger
# Declaration of the EnPosTagger extension type; it declares no members of
# its own beyond what Tagger provides.
cdef class EnPosTagger(Tagger):
    pass

View File

@ -1,11 +0,0 @@
from os import path
from ..parts_of_speech cimport NOUN, VERB, ADJ
from ..lemmatizer import Lemmatizer
cdef class EnPosTagger(Tagger):
    """A part-of-speech tagger for English"""
    def make_lemmatizer(self, data_dir):
        """Return a Lemmatizer built from the 'wordnet' subdirectory of data_dir.

        The NOUN, VERB and ADJ part-of-speech IDs are passed through to the
        Lemmatizer constructor.
        """
        return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

View File

@ -1,11 +1,18 @@
# cython: profile=True
from __future__ import unicode_literals
from os import path from os import path
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t from .attrs cimport attr_id_t
from .structs cimport TokenC from .structs cimport TokenC, LexemeC
from .lexeme cimport Lexeme
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
@ -15,6 +22,38 @@ from .vocab cimport Vocab
from libcpp.vector cimport vector from libcpp.vector cimport vector
from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT
from .attrs import FLAG57 as B5_ENT
from .attrs import FLAG56 as B6_ENT
from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT
from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT
from .attrs import FLAG48 as I6_ENT
from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT
from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT
from .attrs import FLAG40 as L5_ENT
from .attrs import FLAG39 as L6_ENT
from .attrs import FLAG38 as L7_ENT
from .attrs import FLAG37 as L8_ENT
from .attrs import FLAG36 as L9_ENT
from .attrs import FLAG35 as L10_ENT
try: try:
import ujson as json import ujson as json
except ImportError: except ImportError:
@ -41,7 +80,7 @@ cdef Pattern* init_pattern(Pool mem, object token_specs, attr_t entity_type) exc
pattern[i].spec[j].attr = attr pattern[i].spec[j].attr = attr
pattern[i].spec[j].value = value pattern[i].spec[j].value = value
i = len(token_specs) i = len(token_specs)
pattern[i].spec = <AttrValue*>mem.alloc(1, sizeof(AttrValue)) pattern[i].spec = <AttrValue*>mem.alloc(2, sizeof(AttrValue))
pattern[i].spec[0].attr = ENT_TYPE pattern[i].spec[0].attr = ENT_TYPE
pattern[i].spec[0].value = entity_type pattern[i].spec[0].value = entity_type
pattern[i].spec[1].attr = LENGTH pattern[i].spec[1].attr = LENGTH
@ -81,7 +120,33 @@ def _convert_strings(token_specs, string_store):
value = int(value) value = int(value)
converted[-1].append((attr, value)) converted[-1].append((attr, value))
return converted return converted
def get_bilou(length):
    """Return the list of flag IDs that tag a phrase of `length` tokens.

    A one-token phrase is tagged [U_ENT]. A phrase of n tokens (2 <= n <= 10)
    is tagged with the B<n> flag, then n - 2 copies of the I<n> flag, then the
    L<n> flag, so every phrase length has its own set of flags.

    Raises ValueError when `length` is not in the range 1 to 10.
    """
    if length == 1:
        return [U_ENT]
    # length -> (begin flag, inside flag, last flag); length 2 has no inside.
    flags_by_length = {
        2: (B2_ENT, None, L2_ENT),
        3: (B3_ENT, I3_ENT, L3_ENT),
        4: (B4_ENT, I4_ENT, L4_ENT),
        5: (B5_ENT, I5_ENT, L5_ENT),
        6: (B6_ENT, I6_ENT, L6_ENT),
        7: (B7_ENT, I7_ENT, L7_ENT),
        8: (B8_ENT, I8_ENT, L8_ENT),
        9: (B9_ENT, I9_ENT, L9_ENT),
        10: (B10_ENT, I10_ENT, L10_ENT),
    }
    if length not in flags_by_length:
        raise ValueError("Max length currently 10 for phrase matching")
    begin, inside, last = flags_by_length[length]
    return [begin] + [inside] * (length - 2) + [last]
def map_attr_name(attr): def map_attr_name(attr):
attr = attr.upper() attr = attr.upper()
@ -95,32 +160,6 @@ def map_attr_name(attr):
return SHAPE return SHAPE
elif attr == 'NORM': elif attr == 'NORM':
return NORM return NORM
elif attr == 'FLAG13':
return FLAG13
elif attr == 'FLAG14':
return FLAG14
elif attr == 'FLAG15':
return FLAG15
elif attr == 'FLAG16':
return FLAG16
elif attr == 'FLAG17':
return FLAG17
elif attr == 'FLAG18':
return FLAG18
elif attr == 'FLAG19':
return FLAG19
elif attr == 'FLAG20':
return FLAG20
elif attr == 'FLAG21':
return FLAG21
elif attr == 'FLAG22':
return FLAG22
elif attr == 'FLAG23':
return FLAG23
elif attr == 'FLAG24':
return FLAG24
elif attr == 'FLAG25':
return FLAG25
else: else:
raise Exception("TODO: Finish supporting attr mapping %s" % attr) raise Exception("TODO: Finish supporting attr mapping %s" % attr)
@ -163,7 +202,7 @@ cdef class Matcher:
spec = _convert_strings(spec, self.vocab.strings) spec = _convert_strings(spec, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype)) self.patterns.push_back(init_pattern(self.mem, spec, etype))
def __call__(self, Doc doc): def __call__(self, Doc doc, acceptor=None):
cdef vector[Pattern*] partials cdef vector[Pattern*] partials
cdef int n_partials = 0 cdef int n_partials = 0
cdef int q = 0 cdef int q = 0
@ -174,21 +213,94 @@ cdef class Matcher:
for token_i in range(doc.length): for token_i in range(doc.length):
token = &doc.data[token_i] token = &doc.data[token_i]
q = 0 q = 0
# Go over the open matches, extending or finalizing if able. Otherwise,
# we over-write them (q doesn't advance)
for i in range(partials.size()): for i in range(partials.size()):
state = partials.at(i) state = partials.at(i)
if match(state, token): if match(state, token):
if is_final(state): if is_final(state):
matches.append(get_entity(state, token, token_i)) label, start, end = get_entity(state, token, token_i)
if acceptor is None or acceptor(doc, label, start, end):
matches.append((label, start, end))
else: else:
partials[q] = state + 1 partials[q] = state + 1
q += 1 q += 1
partials.resize(q) partials.resize(q)
# Check whether we open any new patterns on this token
for i in range(self.n_patterns): for i in range(self.n_patterns):
state = self.patterns[i] state = self.patterns[i]
if match(state, token): if match(state, token):
if is_final(state): if is_final(state):
matches.append(get_entity(state, token, token_i)) label, start, end = get_entity(state, token, token_i)
if acceptor is None or acceptor(doc, label, start, end):
matches.append((label, start, end))
else: else:
partials.push_back(state + 1) partials.push_back(state + 1)
doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
return matches return matches
cdef class PhraseMatcher:
    """Match a fixed set of token phrases against documents.

    Each token of a registered phrase gets a position-specific flag set on its
    lexeme (see get_bilou). A Matcher holding one abstract flag pattern per
    phrase length proposes candidate spans, and accept_match keeps only the
    candidates whose hashed sequence of orth IDs was registered through add().
    """
    cdef Pool mem
    cdef Vocab vocab
    cdef Matcher matcher
    cdef PreshMap phrase_ids

    cdef int max_length
    # Scratch buffer of max_length orth IDs, reused to build every hash key.
    cdef attr_t* _phrase_key

    def __init__(self, Vocab vocab, phrases, max_length=10):
        """Register `phrases` and build the abstract candidate patterns.

        Only phrases with fewer than `max_length` tokens are added, and
        patterns are created for lengths 1 to max_length - 1.
        """
        self.mem = Pool()
        self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
        self.max_length = max_length
        self.vocab = vocab
        self.matcher = Matcher(self.vocab, {})
        self.phrase_ids = PreshMap()
        for phrase in phrases:
            if len(phrase) < max_length:
                self.add(phrase)

        # One pattern per length; each token spec requires that position's flag.
        abstract_patterns = []
        for length in range(1, max_length):
            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
        self.matcher.add('Candidate', 'MWE', {}, abstract_patterns)

    def add(self, Doc tokens):
        """Register one phrase: flag its lexemes and store the hash of its orth IDs."""
        cdef int length = tokens.length
        assert length < self.max_length
        tags = get_bilou(length)
        assert len(tags) == length, length

        cdef int i
        # Zero the whole buffer so phrases shorter than max_length are
        # zero-padded before hashing.
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i, tag in enumerate(tags):
            lexeme = self.vocab[tokens.data[i].lex.orth]
            lexeme.set_flag(tag, True)
            self._phrase_key[i] = lexeme.orth
        # The key always covers the full max_length buffer, padding included.
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        self.phrase_ids[key] = True

    def __call__(self, Doc doc):
        """Find registered phrases in `doc`, merge each one, and return the matches.

        Each returned match is a tuple
        (start_char, end_char, root tag, text, 'MWE').
        """
        matches = []
        for label, start, end in self.matcher(doc, acceptor=self.accept_match):
            cand = doc[start : end]
            # Convert the token span into character offsets.
            start = cand[0].idx
            end = cand[-1].idx + len(cand[-1])
            matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
        # Merge only after matching has finished over the whole document.
        for match in matches:
            doc.merge(*match)
        return matches

    def accept_match(self, Doc doc, int label, int start, int end):
        """Return True if the orth IDs of doc[start:end] hash to a registered phrase."""
        assert (end - start) < self.max_length
        cdef int i, j
        # Build the key exactly as add() does: zero-padded orth IDs.
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i, j in enumerate(range(start, end)):
            self._phrase_key[i] = doc.data[j].lex.orth
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        if self.phrase_ids.get(key):
            return True
        else:
            return False

View File

@ -31,10 +31,7 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int assign_tag(self, TokenC* token, tag) except -1:
cdef int tag_id cdef int tag_id
if isinstance(tag, basestring): if isinstance(tag, basestring):
try: tag_id = self.reverse_index[self.strings[tag]]
tag_id = self.reverse_index[self.strings[tag]]
except KeyError:
raise
else: else:
tag_id = tag tag_id = tag
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)

View File

@ -11,6 +11,7 @@ try:
except ImportError: except ImportError:
from text_unidecode import unidecode from text_unidecode import unidecode
import re import re
import math import math
@ -165,7 +166,7 @@ cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):
cpdef bytes asciied(unicode string): cpdef bytes asciied(unicode string):
cdef str stripped = unidecode(string) stripped = unidecode(string)
if not stripped: if not stripped:
return b'???' return b'???'
return stripped.encode('ascii') return stripped.encode('ascii')

View File

@ -96,7 +96,9 @@ cdef class Vocab:
lex = <LexemeC*>self._by_hash.get(key) lex = <LexemeC*>self._by_hash.get(key)
cdef size_t addr cdef size_t addr
if lex != NULL: if lex != NULL:
assert lex.orth == self.strings[string] if lex.orth != self.strings[string]:
raise LookupError.mismatched_strings(
lex.orth, self.strings[lex.orth], string)
return lex return lex
else: else:
return self._new_lexeme(mem, string) return self._new_lexeme(mem, string)
@ -352,6 +354,21 @@ def write_binary_vectors(in_loc, out_loc):
out_file.write_from(vec, vec_len, sizeof(float)) out_file.write_from(vec, vec_len, sizeof(float))
class LookupError(Exception):
    """Error raised when a Vocab lookup yields an inconsistent lexeme.

    NOTE(review): this class has the same name as Python's builtin
    ``LookupError``; the name is kept because the Vocab lookup code raises it
    by this name.
    """

    @classmethod
    def mismatched_strings(cls, id_, id_string, original_string):
        """Build the error for a lexeme whose orth ID does not match the query.

        id_: The orth ID stored on the cached lexeme.
        id_string: The string that the orth ID decodes to.
        original_string: The string that was looked up.

        Returns an instance of ``cls`` carrying the formatted message.
        """
        template = (
            "Error fetching a Lexeme from the Vocab. When looking up a string, "
            "the lexeme returned had an orth ID that did not match the query string. "
            "This means that the cached lexeme structs are mismatched to the "
            "string encoding table. The mismatched:\n"
            "Query string: {query}\n"
            "Orth cached: {orth_str}\n"
            "ID of orth: {orth_id}"
        )
        message = template.format(
            query=original_string,
            orth_str=id_string,
            orth_id=id_,
        )
        return cls(message)
class VectorReadError(Exception): class VectorReadError(Exception):
@classmethod @classmethod
def mismatched_sizes(cls, loc, line_num, prev_size, curr_size): def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):

View File

@ -3,6 +3,7 @@ import pytest
from spacy.matcher import Matcher from spacy.matcher import Matcher
@pytest.mark.xfail
def test_overlap_issue118(EN): def test_overlap_issue118(EN):
'''Test a bug that arose from having overlapping matches''' '''Test a bug that arose from having overlapping matches'''
doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.en import attrs from spacy import attrs
def test_attr_of_token(EN): def test_attr_of_token(EN):

View File

@ -1,8 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English from spacy.en import English
from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT from spacy.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT
from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM from spacy.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM
from spacy.en.attrs import IS_STOP from spacy.attrs import IS_STOP
import pytest import pytest

View File

@ -2,7 +2,7 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.en.attrs import * from spacy.attrs import *
def test_is_alpha(en_vocab): def test_is_alpha(en_vocab):

View File

@ -26,6 +26,7 @@ def test_main_entry_point(nlp):
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
@pytest.mark.models
def test_sentence_spans(nlp): def test_sentence_spans(nlp):
# from spacy.en import English # from spacy.en import English
# nlp = English() # nlp = English()
@ -33,6 +34,7 @@ def test_sentence_spans(nlp):
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
@pytest.mark.models
def test_entity_spans(nlp): def test_entity_spans(nlp):
# from spacy.en import English # from spacy.en import English
# nlp = English() # nlp = English()
@ -44,6 +46,7 @@ def test_entity_spans(nlp):
assert ents[0].string == ents[0].string assert ents[0].string == ents[0].string
@pytest.mark.models
def test_noun_chunk_spans(nlp): def test_noun_chunk_spans(nlp):
# from spacy.en import English # from spacy.en import English
# nlp = English() # nlp = English()
@ -56,11 +59,12 @@ def test_noun_chunk_spans(nlp):
# NP three noun chunks <-- has # NP three noun chunks <-- has
@pytest.mark.models
def test_count_by(nlp): def test_count_by(nlp):
# from spacy.en import English, attrs # from spacy.en import English, attrs
# nlp = English() # nlp = English()
import numpy import numpy
from spacy.en import attrs from spacy import attrs
tokens = nlp('apple apple orange banana') tokens = nlp('apple apple orange banana')
assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1} assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1}
assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529], assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529],
@ -88,6 +92,7 @@ def test_token_span(doc):
assert token.i == 4 assert token.i == 4
@pytest.mark.models
def test_example_i_like_new_york1(nlp): def test_example_i_like_new_york1(nlp):
toks = nlp('I like New York in Autumn.') toks = nlp('I like New York in Autumn.')
@ -127,16 +132,19 @@ def dot(toks):
return tok(toks, "dot") return tok(toks, "dot")
@pytest.mark.models
def test_example_i_like_new_york3(toks, new, york): def test_example_i_like_new_york3(toks, new, york):
assert toks[new].head.orth_ == 'York' assert toks[new].head.orth_ == 'York'
assert toks[york].head.orth_ == 'like' assert toks[york].head.orth_ == 'like'
@pytest.mark.models
def test_example_i_like_new_york4(toks, new, york): def test_example_i_like_new_york4(toks, new, york):
new_york = toks[new:york+1] new_york = toks[new:york+1]
assert new_york.root.orth_ == 'York' assert new_york.root.orth_ == 'York'
@pytest.mark.models
def test_example_i_like_new_york5(toks, autumn, dot): def test_example_i_like_new_york5(toks, autumn, dot):
assert toks[autumn].head.orth_ == 'in' assert toks[autumn].head.orth_ == 'in'
assert toks[dot].head.orth_ == 'like' assert toks[dot].head.orth_ == 'like'
@ -144,6 +152,7 @@ def test_example_i_like_new_york5(toks, autumn, dot):
assert autumn_dot.root.orth_ == 'Autumn' assert autumn_dot.root.orth_ == 'Autumn'
@pytest.mark.models
def test_navigating_the_parse_tree_lefts(doc): def test_navigating_the_parse_tree_lefts(doc):
# TODO: where does the span object come from? # TODO: where does the span object come from?
span = doc[:2] span = doc[:2]
@ -151,6 +160,7 @@ def test_navigating_the_parse_tree_lefts(doc):
if span.doc[i].head in span] if span.doc[i].head in span]
@pytest.mark.models
def test_navigating_the_parse_tree_rights(doc): def test_navigating_the_parse_tree_rights(doc):
span = doc[:2] span = doc[:2]
rights = [span.doc[i] for i in range(span.end, len(span.doc)) rights = [span.doc[i] for i in range(span.end, len(span.doc))

View File

@ -1,6 +1,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
import spacy.en import spacy
@pytest.fixture() @pytest.fixture()
@ -22,6 +22,7 @@ def test_get_tokens_and_sentences(doc):
assert sentence.text == 'Hello, world.' assert sentence.text == 'Hello, world.'
@pytest.mark.models
def test_use_integer_ids_for_any_strings(nlp, token): def test_use_integer_ids_for_any_strings(nlp, token):
hello_id = nlp.vocab.strings['Hello'] hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id] hello_str = nlp.vocab.strings[hello_id]
@ -45,7 +46,7 @@ def test_get_and_set_string_views_and_flags(nlp, token):
def test_export_to_numpy_arrays(nlp, doc): def test_export_to_numpy_arrays(nlp, doc):
from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV from spacy.attrs import ORTH, LIKE_URL, IS_OOV
attr_ids = [ORTH, LIKE_URL, IS_OOV] attr_ids = [ORTH, LIKE_URL, IS_OOV]
doc_array = doc.to_array(attr_ids) doc_array = doc.to_array(attr_ids)
@ -68,6 +69,7 @@ def test_word_vectors(nlp):
assert apples.similarity(oranges) > boots.similarity(hippos) assert apples.similarity(oranges) > boots.similarity(hippos)
@pytest.mark.models
def test_part_of_speech_tags(nlp): def test_part_of_speech_tags(nlp):
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV

View File

@ -24,7 +24,7 @@ include ./meta.jade
p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses. p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses.
p It doesn't always guess right, but we can tell how often it does, and we can think of ways t help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit.
p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.) p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.)