Mirror of https://github.com/explosion/spaCy.git

Commit 9a6811acc4: Merge remote-tracking branch 'upstream/master'
@@ -24,4 +24,4 @@ install:

 # run tests
 script:
-  - "py.test tests/ website/tests/ -x"
+  - "py.test tests/ -x"
@@ -148,8 +148,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
     nlp.end_training(model_dir)
     print('done')


 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
-             beam_width=None):
+             beam_width=None, cand_preproc=None):
     nlp = Language(data_dir=model_dir)
     if beam_width is not None:
         nlp.parser.cfg.beam_width = beam_width
@@ -166,16 +167,14 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
                 nlp.entity(tokens)
                 nlp.parser(tokens)
             else:
-                tokens = nlp(raw_text, merge_mwes=False)
+                tokens = nlp(raw_text)
             gold = GoldParse(tokens, annot_tuples)
             scorer.score(tokens, gold, verbose=verbose)
     return scorer


-def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
+def write_parses(Language, dev_loc, model_dir, out_loc):
     nlp = Language(data_dir=model_dir)
-    if beam_width is not None:
-        nlp.parser.cfg.beam_width = beam_width
     gold_tuples = read_json_file(dev_loc)
     scorer = Scorer()
     out_file = codecs.open(out_loc, 'w', 'utf8')
@@ -188,14 +187,16 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
                 nlp.entity(tokens)
                 nlp.parser(tokens)
             else:
-                tokens = nlp(raw_text, merge_mwes=False)
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold, verbose=False)
-            for t in tokens:
-                out_file.write(
-                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
-                )
-    return scorer
+                tokens = nlp(raw_text)
+            #gold = GoldParse(tokens, annot_tuples)
+            #scorer.score(tokens, gold, verbose=False)
+            for sent in tokens.sents:
+                for t in sent:
+                    if not t.is_space:
+                        out_file.write(
+                            '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
+                        )
+                out_file.write('\n')


 @plac.annotations(
@@ -220,14 +221,15 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
           gold_preproc=gold_preproc, n_sents=n_sents,
          corruption_level=corruption_level, n_iter=n_iter,
          verbose=verbose)
-    #if out_loc:
-    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
+    if out_loc:
+        write_parses(English, dev_loc, model_dir, out_loc)
     scorer = evaluate(English, list(read_json_file(dev_loc)),
                       model_dir, gold_preproc=gold_preproc, verbose=verbose)
     print('TOK', scorer.token_acc)
     print('POS', scorer.tags_acc)
     print('UAS', scorer.uas)
     print('LAS', scorer.las)
     print('SBD', scorer.sbd_acc)

     print('NER P', scorer.ents_p)
     print('NER R', scorer.ents_r)
bin/parser/train_ud.py (new file, 151 lines)
@@ -0,0 +1,151 @@
import plac
import json
from os import path
import shutil
import os
import random

from spacy.syntax.util import Config
from spacy.gold import GoldParse
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import get_templates
from spacy.scorer import Scorer

from spacy.language import Language

from spacy.tagger import W_orth

TAGGER_TEMPLATES = (
    (W_orth,),
)

try:
    from codecs import open
except ImportError:
    pass


class TreebankParser(object):
    @staticmethod
    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
        dep_model_dir = path.join(model_dir, 'deps')
        pos_model_dir = path.join(model_dir, 'pos')
        if path.exists(dep_model_dir):
            shutil.rmtree(dep_model_dir)
        if path.exists(pos_model_dir):
            shutil.rmtree(pos_model_dir)
        os.mkdir(dep_model_dir)
        os.mkdir(pos_model_dir)

        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                     labels=labels)

    @classmethod
    def from_dir(cls, tag_map, model_dir):
        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
        return cls(vocab, tokenizer, tagger, parser)

    def __init__(self, vocab, tokenizer, tagger, parser):
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tagger = tagger
        self.parser = parser

    def train(self, words, tags, heads, deps):
        tokens = self.tokenizer.tokens_from_list(list(words))
        self.tagger.train(tokens, tags)

        tokens = self.tokenizer.tokens_from_list(list(words))
        ids = range(len(words))
        ner = ['O'] * len(words)
        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
                         make_projective=False)
        self.tagger(tokens)
        if gold.is_projective:
            try:
                self.parser.train(tokens, gold)
            except:
                for id_, word, head, dep in zip(ids, words, heads, deps):
                    print(id_, word, head, dep)
                raise

    def __call__(self, words, tags=None):
        tokens = self.tokenizer.tokens_from_list(list(words))
        if tags is None:
            self.tagger(tokens)
        else:
            self.tagger.tag_from_strings(tokens, tags)
        self.parser(tokens)
        return tokens

    def end_training(self, data_dir):
        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))


def read_conllx(loc):
    with open(loc, 'r', 'utf8') as file_:
        text = file_.read()
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        if lines:
            if lines[0].startswith('#'):
                lines.pop(0)
            tokens = []
            for line in lines:
                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
                if '-' in id_:
                    continue
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != '0' else id_
                dep = 'ROOT' if dep == 'root' else dep
                tokens.append((id_, word, tag, head, dep, 'O'))
            tuples = zip(*tokens)
            yield (None, [(tuples, [])])


def score_model(nlp, gold_docs, verbose=False):
    scorer = Scorer()
    for _, gold_doc in gold_docs:
        for annot_tuples, _ in gold_doc:
            tokens = nlp(list(annot_tuples[1]), tags=list(annot_tuples[2]))
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer


def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    labels = ArcEager.get_labels(train_sents)
    templates = get_templates('basic')

    TreebankParser.setup_model_dir(model_dir, labels, templates)

    nlp = TreebankParser.from_dir(tag_map, model_dir)

    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                nlp.train(words, tags, heads, deps)
        random.shuffle(train_sents)
        scorer = score_model(nlp, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp.end_training(model_dir)
    scorer = score_model(nlp, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))


if __name__ == '__main__':
    plac.call(main)
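As a side note on the new bin/parser/train_ud.py script above: the column handling in read_conllx() is easier to see in isolation. The following is a minimal, standalone sketch of that step only; the sample sentence, the parse_sentence name and the explicit tab-splitting are illustrative assumptions by the editor, not code from the commit.

# Sketch (not part of the commit): how one CoNLL-X/CoNLL-U style block is
# reduced to (id, word, tag, head, dep, ner) tuples, mirroring read_conllx().
SAMPLE = """\
1\tDer\tder\tDET\tART\t_\t2\tdet\t_\t_
2\tHund\tHund\tNOUN\tNN\t_\t3\tnsubj\t_\t_
3\tbellt\tbellen\tVERB\tVVFIN\t_\t0\troot\t_\t_"""

def parse_sentence(block):
    tokens = []
    for line in block.split('\n'):
        id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split('\t')
        if '-' in id_:           # skip multi-word token ranges like "3-4"
            continue
        id_ = int(id_) - 1       # 1-based CoNLL ids -> 0-based offsets
        head = (int(head) - 1) if head != '0' else id_   # root points at itself
        dep = 'ROOT' if dep == 'root' else dep
        tokens.append((id_, word, tag, head, dep, 'O'))
    return tokens

print(parse_sentence(SAMPLE))
# [(0, 'Der', 'ART', 1, 'det', 'O'), (1, 'Hund', 'NN', 2, 'nsubj', 'O'), (2, 'bellt', 'VVFIN', 2, 'ROOT', 'O')]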
contributors/chrisdubois.md (new file, 95 lines)
@@ -0,0 +1,95 @@
Syllogism Contributor Agreement
===============================

This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
Agreement. The SCA applies to any contribution that you make to any product or
project managed by us (the “project”), and sets out the intellectual property
rights you grant to us in the contributed materials. The term “us” shall mean
Syllogism Co. The term "you" shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested below
and include the filled-in version with your first pull-request, under the file
contrbutors/. The name of the file should be your GitHub username, with the
extension .md. For example, the user example_user would create the file
spaCy/contributors/example_user.md .

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

1. The term 'contribution' or ‘contributed materials’ means any source code,
object code, patch, tool, sample, graphic, specification, manual, documentation,
or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and registrations,
in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such assignment
    is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
    irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
    to exercise all rights under those copyrights. This includes, at our option, the
    right to sublicense these same rights to third parties through multiple levels of
    sublicensees or other licensing arrangements;

    * you agree that each of us can do all things in relation to your contribution
    as if each of us were the sole owners, and if one of us makes a derivative work
    of your contribution, the one who makes the derivative work (or has it made) will
    be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution against
    us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and exercise
    all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the consent
    of, pay or render an accounting to the other for any use or distribution of your
    contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer your
    contribution in whole or in part, alone or in combination with
    or included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through multiple
    levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective on
the date you first submitted a contribution to us, even if your submission took
place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * Each contribution that you submit is and shall be an original work of authorship
    and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any third
    party's copyrights, trademarks, patents, or other intellectual property rights; and

    * each contribution shall be in compliance with U.S. export control laws and other
    applicable export and import laws. You agree to notify us if you become aware of
    any circumstance which would make any of the foregoing representations inaccurate
    in any respect. Syllogism Co. may publicly disclose your participation in the project,
    including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable U.S.
Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

    x    I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions.

    ____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.

| Field                           | Entry                |
|-------------------------------- | -------------------- |
| Name                            | Chris DuBois         |
| Company's name (if applicable)  |                      |
| Title or Role (if applicable)   |                      |
| Date                            | 2015.10.07           |
| GitHub username                 | chrisdubois          |
| Website (optional)              |                      |
@@ -45,14 +45,14 @@ def main():
     nlp = English()
     texts = [
         u'Net income was $9.4 million compared to the prior year of $2.7 million.',
-        u'Revenue exceeded twelve billion dollars, with a loss of $1b',
+        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
     ]

     for text in texts:
         doc = nlp(text)
         relations = extract_currency_relations(doc)
         for r1, r2 in relations:
-            print(r1.text, r2.ent_type_)
+            print(r1.text, r2.ent_type_, r2.text)


 if __name__ == '__main__':
@@ -22,73 +22,77 @@ our pattern set stays very small (exact size depends on the maximum length we're
 looking for, as the query language currently has no quantifiers)
 """
 from __future__ import print_function, unicode_literals, division
+from ast import literal_eval
+from bz2 import BZ2File
+import time
 import math
+import codecs

 import plac

 from preshed.maps import PreshMap
+from preshed.counter import PreshCounter
 from spacy.strings import hash_string
 from spacy.en import English
 from spacy.matcher import Matcher

-from spacy.attrs import FLAG63 as U_ENT
-from spacy.attrs import FLAG62 as L_ENT
-from spacy.attrs import FLAG61 as I_ENT
-from spacy.attrs import FLAG60 as B_ENT
+from spacy.matcher import PhraseMatcher


-def get_bilou(length):
-    if length == 1:
-        return [U_ENT]
-    else:
-        return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT]
+def read_gazetteer(tokenizer, loc, n=-1):
+    for i, line in enumerate(open(loc)):
+        phrase = literal_eval('u' + line.strip())
+        if ' (' in phrase and phrase.endswith(')'):
+            phrase = phrase.split(' (', 1)[0]
+        if i >= n:
+            break
+        phrase = tokenizer(phrase)
+        if all((t.is_lower and t.prob >= -10) for t in phrase):
+            continue
+        if len(phrase) >= 2:
+            yield phrase


-def make_matcher(vocab, max_length):
-    abstract_patterns = []
-    for length in range(1, max_length+1):
-        abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
-    return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)})
+def read_text(bz2_loc):
+    with BZ2File(bz2_loc) as file_:
+        for line in file_:
+            yield line.decode('utf8')


-def get_matches(matcher, pattern_ids, doc):
-    matches = []
-    for label, start, end in matcher(doc):
-        candidate = doc[start : end]
-        if pattern_ids[hash_string(candidate.text)] == True:
-            start = candidate[0].idx
-            end = candidate[-1].idx + len(candidate[-1])
-            matches.append((start, end, candidate.root.tag_, candidate.text))
-    return matches
+def get_matches(tokenizer, phrases, texts, max_length=6):
+    matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
+    print("Match")
+    for text in texts:
+        doc = tokenizer(text)
+        matches = matcher(doc)
+        for mwe in doc.ents:
+            yield mwe


 def merge_matches(doc, matches):
     for start, end, tag, text in matches:
         doc.merge(start, end, tag, text, 'MWE')


-def main():
+def main(patterns_loc, text_loc, counts_loc, n=10000000):
     nlp = English(parser=False, tagger=False, entity=False)

-    gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones']
-    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
-    pattern_ids = PreshMap()
-    max_length = 0
-    for pattern_str in gazetteer:
-        pattern = nlp.tokenizer(pattern_str)
-        bilou_tags = get_bilou(len(pattern))
-        for word, tag in zip(pattern, bilou_tags):
-            lexeme = nlp.vocab[word.orth]
-            lexeme.set_flag(tag, True)
-        pattern_ids[hash_string(pattern.text)] = True
-        max_length = max(max_length, len(pattern))
-
-    matcher = make_matcher(nlp.vocab, max_length)
-
-    doc = nlp(example_text)
-    matches = get_matches(matcher, pattern_ids, doc)
-    merge_matches(doc, matches)
-    for token in doc:
-        print(token.text, token.ent_type_)
+    print("Make matcher")
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
+    counts = PreshCounter()
+    t1 = time.time()
+    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
+        counts.inc(hash_string(mwe.text), 1)
+    t2 = time.time()
+    print("10m tokens in %d s" % (t2 - t1))
+
+    with codecs.open(counts_loc, 'w', 'utf8') as file_:
+        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
+            text = phrase.string
+            key = hash_string(text)
+            count = counts[key]
+            if count != 0:
+                file_.write('%d\t%s\n' % (count, text))


 if __name__ == '__main__':
-    plac.call(main)
+    if False:
+        import cProfile
+        import pstats
+        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
+        s = pstats.Stats("Profile.prof")
+        s.strip_dirs().sort_stats("time").print_stats()
+    else:
+        plac.call(main)
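The docstring at the top of the example above explains why the pattern set stays very small: the query language has no quantifiers, so one abstract pattern per phrase length is enough once every gazetteer word carries a BILOU position flag. Below is a minimal, pure-Python sketch of that idea from the editor; all names (B_ENT, build_flags, candidates, the sample gazetteer) are illustrative and not spaCy API.

# Sketch (not part of the commit): BILOU position flags let a handful of
# length-based patterns match any number of gazetteer phrases.
B, I, L, U = 'B_ENT', 'I_ENT', 'L_ENT', 'U_ENT'

def get_bilou(length):
    if length == 1:
        return [U]
    return [B] + [I] * (length - 2) + [L]

def build_flags(gazetteer):
    flags = {}                      # word -> set of position flags
    for phrase in gazetteer:
        words = phrase.split()
        for word, tag in zip(words, get_bilou(len(words))):
            flags.setdefault(word, set()).add(tag)
    return flags

def candidates(tokens, flags, max_length):
    # One abstract pattern per length ([U] or [B, I..., L]); a token run is a
    # candidate if its flags fit the pattern, regardless of which phrase set them.
    for start in range(len(tokens)):
        for length in range(1, max_length + 1):
            window = tokens[start:start + length]
            if len(window) < length:
                break
            pattern = get_bilou(length)
            if all(tag in flags.get(tok, ()) for tok, tag in zip(window, pattern)):
                yield ' '.join(window)

gazetteer = ['New York City', 'Berlin']
flags = build_flags(gazetteer)
tokens = 'She moved from Berlin to New York City'.split()
print(list(candidates(tokens, flags, max_length=4)))
# ['Berlin', 'New York City'] -- candidates are then checked against the exact
# phrase set (e.g. by hashing) to reject accidental flag combinations.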
@@ -13,5 +13,7 @@
     "ADP": {"pos": "ADP"},
     "SYM": {"pos": "SYM"},
     "X": {"pos": "X"},
-    "INTJ": {"pos": "INTJ"}
+    "INTJ": {"pos": "INTJ"},
+    "DET": {"pos": "DET"},
+    "PART": {"pos": "PART"}
 }
@@ -2,43 +2,43 @@
     "S": {"pos": "NOUN"},
     "E": {"pos": "ADP"},
     "RD": {"pos": "DET"},
-    "V": {"pos": "VER"},
-    "_": {"pos": "_"},
+    "V": {"pos": "VERB"},
+    "_": {"pos": "NO_TAG"},
     "A": {"pos": "ADJ"},
-    "SP": {"pos": "PROP"},
-    "FF": {"pos": "PUNC"},
-    "FS": {"pos": "PUNC"},
+    "SP": {"pos": "PROPN"},
+    "FF": {"pos": "PUNCT"},
+    "FS": {"pos": "PUNCT"},
     "B": {"pos": "ADV"},
-    "CC": {"pos": "CON"},
-    "FB": {"pos": "PUNC"},
+    "CC": {"pos": "CONJ"},
+    "FB": {"pos": "PUNCT"},
     "VA": {"pos": "AUX"},
-    "PC": {"pos": "PRO"},
+    "PC": {"pos": "PRON"},
     "N": {"pos": "NUM"},
     "RI": {"pos": "DET"},
-    "PR": {"pos": "PRO"},
-    "CS": {"pos": "SCON"},
+    "PR": {"pos": "PRON"},
+    "CS": {"pos": "SCONJ"},
     "BN": {"pos": "ADV"},
    "AP": {"pos": "DET"},
     "VM": {"pos": "AUX"},
     "DI": {"pos": "DET"},
-    "FC": {"pos": "PUNC"},
-    "PI": {"pos": "PRO"},
+    "FC": {"pos": "PUNCT"},
+    "PI": {"pos": "PRON"},
     "DD": {"pos": "DET"},
     "DQ": {"pos": "DET"},
-    "PQ": {"pos": "PRO"},
-    "PD": {"pos": "PRO"},
+    "PQ": {"pos": "PRON"},
+    "PD": {"pos": "PRON"},
     "NO": {"pos": "ADJ"},
-    "PE": {"pos": "PRO"},
+    "PE": {"pos": "PRON"},
     "T": {"pos": "DET"},
     "X": {"pos": "SYM"},
     "SW": {"pos": "X"},
-    "NO": {"pos": "PRO"},
-    "I": {"pos": "INT"},
+    "NO": {"pos": "PRON"},
+    "I": {"pos": "INTJ"},
     "X": {"pos": "X"},
     "DR": {"pos": "DET"},
     "EA": {"pos": "ADP"},
-    "PP": {"pos": "PRO"},
+    "PP": {"pos": "PRON"},
     "X": {"pos": "NUM"},
     "DE": {"pos": "DET"},
-    "X": {"pos": "PAR"}
+    "X": {"pos": "PART"}
 }
setup.py (4 changes)
@@ -156,8 +156,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
              'spacy.morphology', 'spacy.tagger',
              'spacy.syntax.stateclass',
              'spacy._ml', 'spacy._theano',
-             'spacy.tokenizer', 'spacy.en.attrs',
-             'spacy.en.pos', 'spacy.syntax.parser',
+             'spacy.tokenizer',
+             'spacy.syntax.parser',
              'spacy.syntax.transition_system',
              'spacy.syntax.arc_eager',
              'spacy.syntax._parse_features',
@@ -1,64 +0,0 @@
from ..attrs cimport FLAG13, FLAG14
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
from ..attrs cimport IS_ALPHA as _IS_ALPHA
from ..attrs cimport IS_DIGIT as _IS_DIGIT
from ..attrs cimport IS_ASCII as _IS_ASCII
from ..attrs cimport IS_LOWER as _IS_LOWER
from ..attrs cimport IS_PUNCT as _IS_PUNCT
from ..attrs cimport IS_SPACE as _IS_SPACE
from ..attrs cimport IS_TITLE as _IS_TITLE
from ..attrs cimport IS_UPPER as _IS_UPPER
from ..attrs cimport IS_OOV as _IS_OOV
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
from ..attrs cimport LIKE_URL as _LIKE_URL
from ..attrs cimport LIKE_NUM as _LIKE_NUM
from ..attrs cimport IS_STOP as _IS_STOP
from ..attrs cimport ORTH as _ORTH
from ..attrs cimport SHAPE as _SHAPE
from ..attrs cimport LOWER as _LOWER
from ..attrs cimport NORM as _NORM
from ..attrs cimport CLUSTER as _CLUSTER
from ..attrs cimport PREFIX as _PREFIX
from ..attrs cimport SUFFIX as _SUFFIX
from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG
from ..attrs cimport DEP as _DEP
from ..attrs cimport HEAD as _HEAD
from ..attrs cimport ENT_IOB as _ENT_IOB
from ..attrs cimport ENT_TYPE as _ENT_TYPE
from ..attrs cimport SPACY as _SPACY


cpdef enum:
    IS_ALPHA = _IS_ALPHA
    IS_ASCII = _IS_ASCII
    IS_DIGIT = _IS_DIGIT
    IS_LOWER = _IS_LOWER
    IS_PUNCT = _IS_PUNCT
    IS_SPACE = _IS_SPACE
    IS_TITLE = _IS_TITLE
    IS_UPPER = _IS_UPPER
    LIKE_URL = _LIKE_URL
    LIKE_NUM = _LIKE_NUM
    LIKE_EMAIL = _LIKE_EMAIL
    IS_STOP = _IS_STOP
    IS_OOV = _IS_OOV

    ORTH = _ORTH
    SHAPE = _SHAPE
    LOWER = _LOWER
    NORM = _NORM
    PREFIX = _PREFIX
    SUFFIX = _SUFFIX
    CLUSTER = _CLUSTER
    LEMMA = _LEMMA
    POS = _POS
    TAG = _TAG
    DEP = _DEP
    ENT_IOB = _ENT_IOB
    ENT_TYPE = _ENT_TYPE
    HEAD = _HEAD
    SPACY = _SPACY
@@ -1,21 +0,0 @@
# cython: embedsignature=True
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
from ..orth cimport is_title, is_upper, like_url, like_number, like_email
from ..typedefs cimport flags_t


def get_flags(unicode string, is_oov=False):
    cdef flags_t flags = 0
    flags |= is_oov << IS_OOV
    flags |= is_alpha(string) << IS_ALPHA
    flags |= is_ascii(string) << IS_ASCII
    flags |= is_digit(string) << IS_DIGIT
    flags |= is_lower(string) << IS_LOWER
    flags |= is_punct(string) << IS_PUNCT
    flags |= is_space(string) << IS_SPACE
    flags |= is_title(string) << IS_TITLE
    flags |= is_upper(string) << IS_UPPER
    flags |= like_url(string) << LIKE_URL
    flags |= like_number(string) << LIKE_NUM
    flags |= like_email(string) << LIKE_EMAIL
    return flags
@@ -1,5 +0,0 @@
from ..tagger cimport Tagger


cdef class EnPosTagger(Tagger):
    pass
@@ -1,11 +0,0 @@
from os import path

from ..parts_of_speech cimport NOUN, VERB, ADJ

from ..lemmatizer import Lemmatizer


cdef class EnPosTagger(Tagger):
    """A part-of-speech tagger for English"""
    def make_lemmatizer(self, data_dir):
        return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
@@ -1,11 +1,18 @@
 # cython: profile=True
 from __future__ import unicode_literals

 from os import path

 from .typedefs cimport attr_t
+from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
-from .structs cimport TokenC
+from .structs cimport TokenC, LexemeC
+from .lexeme cimport Lexeme

 from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
+from murmurhash.mrmr cimport hash64

 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
 from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
@@ -15,6 +22,38 @@ from .vocab cimport Vocab

 from libcpp.vector cimport vector

+from .attrs import FLAG61 as U_ENT
+
+from .attrs import FLAG60 as B2_ENT
+from .attrs import FLAG59 as B3_ENT
+from .attrs import FLAG58 as B4_ENT
+from .attrs import FLAG57 as B5_ENT
+from .attrs import FLAG56 as B6_ENT
+from .attrs import FLAG55 as B7_ENT
+from .attrs import FLAG54 as B8_ENT
+from .attrs import FLAG53 as B9_ENT
+from .attrs import FLAG52 as B10_ENT
+
+from .attrs import FLAG51 as I3_ENT
+from .attrs import FLAG50 as I4_ENT
+from .attrs import FLAG49 as I5_ENT
+from .attrs import FLAG48 as I6_ENT
+from .attrs import FLAG47 as I7_ENT
+from .attrs import FLAG46 as I8_ENT
+from .attrs import FLAG45 as I9_ENT
+from .attrs import FLAG44 as I10_ENT
+
+from .attrs import FLAG43 as L2_ENT
+from .attrs import FLAG42 as L3_ENT
+from .attrs import FLAG41 as L4_ENT
+from .attrs import FLAG40 as L5_ENT
+from .attrs import FLAG39 as L6_ENT
+from .attrs import FLAG38 as L7_ENT
+from .attrs import FLAG37 as L8_ENT
+from .attrs import FLAG36 as L9_ENT
+from .attrs import FLAG35 as L10_ENT
+

 try:
     import ujson as json
 except ImportError:
@@ -41,7 +80,7 @@ cdef Pattern* init_pattern(Pool mem, object token_specs, attr_t entity_type) exc
             pattern[i].spec[j].attr = attr
             pattern[i].spec[j].value = value
     i = len(token_specs)
-    pattern[i].spec = <AttrValue*>mem.alloc(1, sizeof(AttrValue))
+    pattern[i].spec = <AttrValue*>mem.alloc(2, sizeof(AttrValue))
     pattern[i].spec[0].attr = ENT_TYPE
     pattern[i].spec[0].value = entity_type
     pattern[i].spec[1].attr = LENGTH
@@ -81,7 +120,33 @@ def _convert_strings(token_specs, string_store):
             value = int(value)
         converted[-1].append((attr, value))
     return converted


+def get_bilou(length):
+    if length == 1:
+        return [U_ENT]
+    elif length == 2:
+        return [B2_ENT, L2_ENT]
+    elif length == 3:
+        return [B3_ENT, I3_ENT, L3_ENT]
+    elif length == 4:
+        return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
+    elif length == 5:
+        return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
+    elif length == 6:
+        return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
+    elif length == 7:
+        return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
+    elif length == 8:
+        return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
+    elif length == 9:
+        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
+    elif length == 10:
+        return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
+                I10_ENT, I10_ENT, L10_ENT]
+    else:
+        raise ValueError("Max length currently 10 for phrase matching")


 def map_attr_name(attr):
     attr = attr.upper()
@@ -95,32 +160,6 @@ def map_attr_name(attr):
         return SHAPE
     elif attr == 'NORM':
         return NORM
-    elif attr == 'FLAG13':
-        return FLAG13
-    elif attr == 'FLAG14':
-        return FLAG14
-    elif attr == 'FLAG15':
-        return FLAG15
-    elif attr == 'FLAG16':
-        return FLAG16
-    elif attr == 'FLAG17':
-        return FLAG17
-    elif attr == 'FLAG18':
-        return FLAG18
-    elif attr == 'FLAG19':
-        return FLAG19
-    elif attr == 'FLAG20':
-        return FLAG20
-    elif attr == 'FLAG21':
-        return FLAG21
-    elif attr == 'FLAG22':
-        return FLAG22
-    elif attr == 'FLAG23':
-        return FLAG23
-    elif attr == 'FLAG24':
-        return FLAG24
-    elif attr == 'FLAG25':
-        return FLAG25
     else:
         raise Exception("TODO: Finish supporting attr mapping %s" % attr)
@@ -163,7 +202,7 @@ cdef class Matcher:
         spec = _convert_strings(spec, self.vocab.strings)
         self.patterns.push_back(init_pattern(self.mem, spec, etype))

-    def __call__(self, Doc doc):
+    def __call__(self, Doc doc, acceptor=None):
         cdef vector[Pattern*] partials
         cdef int n_partials = 0
         cdef int q = 0
@@ -174,21 +213,94 @@ cdef class Matcher:
         for token_i in range(doc.length):
             token = &doc.data[token_i]
             q = 0
             # Go over the open matches, extending or finalizing if able. Otherwise,
             # we over-write them (q doesn't advance)
             for i in range(partials.size()):
                 state = partials.at(i)
                 if match(state, token):
                     if is_final(state):
-                        matches.append(get_entity(state, token, token_i))
+                        label, start, end = get_entity(state, token, token_i)
+                        if acceptor is None or acceptor(doc, label, start, end):
+                            matches.append((label, start, end))
                     else:
                         partials[q] = state + 1
                         q += 1
             partials.resize(q)
             # Check whether we open any new patterns on this token
             for i in range(self.n_patterns):
                 state = self.patterns[i]
                 if match(state, token):
                     if is_final(state):
-                        matches.append(get_entity(state, token, token_i))
+                        label, start, end = get_entity(state, token, token_i)
+                        if acceptor is None or acceptor(doc, label, start, end):
+                            matches.append((label, start, end))
                     else:
                         partials.push_back(state + 1)
         doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches
         return matches


+cdef class PhraseMatcher:
+    cdef Pool mem
+    cdef Vocab vocab
+    cdef Matcher matcher
+    cdef PreshMap phrase_ids
+
+    cdef int max_length
+    cdef attr_t* _phrase_key
+
+    def __init__(self, Vocab vocab, phrases, max_length=10):
+        self.mem = Pool()
+        self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
+        self.max_length = max_length
+        self.vocab = vocab
+        self.matcher = Matcher(self.vocab, {})
+        self.phrase_ids = PreshMap()
+        for phrase in phrases:
+            if len(phrase) < max_length:
+                self.add(phrase)
+
+        abstract_patterns = []
+        for length in range(1, max_length):
+            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
+        self.matcher.add('Candidate', 'MWE', {}, abstract_patterns)
+
+    def add(self, Doc tokens):
+        cdef int length = tokens.length
+        assert length < self.max_length
+        tags = get_bilou(length)
+        assert len(tags) == length, length
+
+        cdef int i
+        for i in range(self.max_length):
+            self._phrase_key[i] = 0
+        for i, tag in enumerate(tags):
+            lexeme = self.vocab[tokens.data[i].lex.orth]
+            lexeme.set_flag(tag, True)
+            self._phrase_key[i] = lexeme.orth
+        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
+        self.phrase_ids[key] = True
+
+    def __call__(self, Doc doc):
+        matches = []
+        for label, start, end in self.matcher(doc, acceptor=self.accept_match):
+            cand = doc[start : end]
+            start = cand[0].idx
+            end = cand[-1].idx + len(cand[-1])
+            matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
+        for match in matches:
+            doc.merge(*match)
+        return matches
+
+    def accept_match(self, Doc doc, int label, int start, int end):
+        assert (end - start) < self.max_length
+        cdef int i, j
+        for i in range(self.max_length):
+            self._phrase_key[i] = 0
+        for i, j in enumerate(range(start, end)):
+            self._phrase_key[i] = doc.data[j].lex.orth
+        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
+        if self.phrase_ids.get(key):
+            return True
+        else:
+            return False
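The acceptor hook added to Matcher.__call__ above lets a caller veto candidate spans before they are returned, and PhraseMatcher.accept_match uses exactly that to keep only spans whose full token sequence was registered via the hashed phrase key. The following is a simplified pure-Python sketch of that control flow from the editor; ToyMatcher/ToyPhraseMatcher and the built-in hash() stand in for the Cython implementation and are illustrative only.

# Sketch (not part of the commit): candidate spans filtered through an acceptor.
class ToyMatcher:
    def __init__(self, candidate_spans):
        self._spans = candidate_spans        # pretend pattern output: (label, start, end)

    def __call__(self, doc, acceptor=None):
        matches = []
        for label, start, end in self._spans:
            if acceptor is None or acceptor(doc, label, start, end):
                matches.append((label, start, end))
        return matches

class ToyPhraseMatcher:
    def __init__(self, phrases):
        self._phrase_keys = {hash(tuple(p.split())) for p in phrases}

    def accept_match(self, doc, label, start, end):
        # Reject spans whose token sequence was never added as a phrase.
        return hash(tuple(doc[start:end])) in self._phrase_keys

doc = 'I saw Shiny Happy People yesterday'.split()
matcher = ToyMatcher([('MWE', 2, 5), ('MWE', 3, 5)])   # two candidate spans
phrases = ToyPhraseMatcher(['Shiny Happy People'])
print(matcher(doc, acceptor=phrases.accept_match))
# [('MWE', 2, 5)] -- 'Happy People' is filtered out by the acceptor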
@@ -31,10 +31,7 @@ cdef class Morphology:
     cdef int assign_tag(self, TokenC* token, tag) except -1:
         cdef int tag_id
         if isinstance(tag, basestring):
-            try:
-                tag_id = self.reverse_index[self.strings[tag]]
-            except KeyError:
-                raise
+            tag_id = self.reverse_index[self.strings[tag]]
         else:
             tag_id = tag
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
@@ -11,6 +11,7 @@ try:
 except ImportError:
     from text_unidecode import unidecode

+import re

 import math

@@ -165,7 +166,7 @@ cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):


 cpdef bytes asciied(unicode string):
-    cdef str stripped = unidecode(string)
+    stripped = unidecode(string)
     if not stripped:
         return b'???'
     return stripped.encode('ascii')
@@ -96,7 +96,9 @@ cdef class Vocab:
        lex = <LexemeC*>self._by_hash.get(key)
        cdef size_t addr
        if lex != NULL:
-           assert lex.orth == self.strings[string]
+           if lex.orth != self.strings[string]:
+               raise LookupError.mismatched_strings(
+                   lex.orth, self.strings[lex.orth], string)
            return lex
        else:
            return self._new_lexeme(mem, string)
@@ -352,6 +354,21 @@ def write_binary_vectors(in_loc, out_loc):
                 out_file.write_from(vec, vec_len, sizeof(float))


+class LookupError(Exception):
+    @classmethod
+    def mismatched_strings(cls, id_, id_string, original_string):
+        return cls(
+            "Error fetching a Lexeme from the Vocab. When looking up a string, "
+            "the lexeme returned had an orth ID that did not match the query string. "
+            "This means that the cached lexeme structs are mismatched to the "
+            "string encoding table. The mismatched:\n"
+            "Query string: {query}\n"
+            "Orth cached: {orth_str}\n"
+            "ID of orth: {orth_id}".format(
+                query=original_string, orth_str=id_string, orth_id=id_)
+        )
+
+
 class VectorReadError(Exception):
     @classmethod
     def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
@@ -3,6 +3,7 @@ import pytest

 from spacy.matcher import Matcher

+@pytest.mark.xfail
 def test_overlap_issue118(EN):
     '''Test a bug that arose from having overlapping matches'''
     doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night')
@@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import attrs
+from spacy import attrs


 def test_attr_of_token(EN):
@@ -1,8 +1,8 @@
 from __future__ import unicode_literals
 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT
-from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM
-from spacy.en.attrs import IS_STOP
+from spacy.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT
+from spacy.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM
+from spacy.attrs import IS_STOP

 import pytest

@@ -2,7 +2,7 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en.attrs import *
+from spacy.attrs import *


 def test_is_alpha(en_vocab):
@@ -26,6 +26,7 @@ def test_main_entry_point(nlp):
     doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.


+@pytest.mark.models
 def test_sentence_spans(nlp):
     # from spacy.en import English
     # nlp = English()
@@ -33,6 +34,7 @@ def test_sentence_spans(nlp):
     assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]


+@pytest.mark.models
 def test_entity_spans(nlp):
     # from spacy.en import English
     # nlp = English()
@@ -44,6 +46,7 @@ def test_entity_spans(nlp):
     assert ents[0].string == ents[0].string


+@pytest.mark.models
 def test_noun_chunk_spans(nlp):
     # from spacy.en import English
     # nlp = English()
@@ -56,11 +59,12 @@ def test_noun_chunk_spans(nlp):
     # NP three noun chunks <-- has


+@pytest.mark.models
 def test_count_by(nlp):
     # from spacy.en import English, attrs
     # nlp = English()
     import numpy
-    from spacy.en import attrs
+    from spacy import attrs
     tokens = nlp('apple apple orange banana')
     assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1}
     assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529],
@@ -88,6 +92,7 @@ def test_token_span(doc):
     assert token.i == 4


+@pytest.mark.models
 def test_example_i_like_new_york1(nlp):
     toks = nlp('I like New York in Autumn.')

@@ -127,16 +132,19 @@ def dot(toks):
     return tok(toks, "dot")


+@pytest.mark.models
 def test_example_i_like_new_york3(toks, new, york):
     assert toks[new].head.orth_ == 'York'
     assert toks[york].head.orth_ == 'like'


+@pytest.mark.models
 def test_example_i_like_new_york4(toks, new, york):
     new_york = toks[new:york+1]
     assert new_york.root.orth_ == 'York'


+@pytest.mark.models
 def test_example_i_like_new_york5(toks, autumn, dot):
     assert toks[autumn].head.orth_ == 'in'
     assert toks[dot].head.orth_ == 'like'
@@ -144,6 +152,7 @@ def test_example_i_like_new_york5(toks, autumn, dot):
     assert autumn_dot.root.orth_ == 'Autumn'


+@pytest.mark.models
 def test_navigating_the_parse_tree_lefts(doc):
     # TODO: where does the span object come from?
     span = doc[:2]
@@ -151,6 +160,7 @@ def test_navigating_the_parse_tree_lefts(doc):
              if span.doc[i].head in span]


+@pytest.mark.models
 def test_navigating_the_parse_tree_rights(doc):
     span = doc[:2]
     rights = [span.doc[i] for i in range(span.end, len(span.doc))
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 import pytest

-import spacy.en
+import spacy


 @pytest.fixture()
@@ -22,6 +22,7 @@ def test_get_tokens_and_sentences(doc):
     assert sentence.text == 'Hello, world.'


+@pytest.mark.models
 def test_use_integer_ids_for_any_strings(nlp, token):
     hello_id = nlp.vocab.strings['Hello']
     hello_str = nlp.vocab.strings[hello_id]
@@ -45,7 +46,7 @@ def test_get_and_set_string_views_and_flags(nlp, token):


 def test_export_to_numpy_arrays(nlp, doc):
-    from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
+    from spacy.attrs import ORTH, LIKE_URL, IS_OOV

     attr_ids = [ORTH, LIKE_URL, IS_OOV]
     doc_array = doc.to_array(attr_ids)
@@ -68,6 +69,7 @@ def test_word_vectors(nlp):
     assert apples.similarity(oranges) > boots.similarity(hippos)


+@pytest.mark.models
 def test_part_of_speech_tags(nlp):
     from spacy.parts_of_speech import ADV

@@ -24,7 +24,7 @@ include ./meta.jade

 p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses.

-p It doesn't always guess right, but we can tell how often it does, and we can think of ways t help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit.
+p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit.

 p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.)