From a77413167120b6bffffce9793c80a4a9b1f9ffa9 Mon Sep 17 00:00:00 2001
From: Oleg Zdornyy
Date: Wed, 9 Mar 2016 18:44:33 -0800
Subject: [PATCH 1/9] Added reloadable English() example for inv. count

---
 .gitignore                                |  5 +-
 examples/InventoryCount/Instructions.md   |  5 ++
 examples/InventoryCount/inventory.py      | 35 +++++++++
 examples/InventoryCount/inventoryCount.py | 92 +++++++++++++++++++++++
 examples/InventoryCount/main.py           | 31 ++++++++
 5 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 examples/InventoryCount/Instructions.md
 create mode 100644 examples/InventoryCount/inventory.py
 create mode 100644 examples/InventoryCount/inventoryCount.py
 create mode 100644 examples/InventoryCount/main.py

diff --git a/.gitignore b/.gitignore
index 40a800245..5c75b8b05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -96,5 +96,8 @@ setup.py
 # Windows local helper files
 *.bat
 
+# Mac OS X
+*.DS_Store
+
 # Komodo project files
-*.komodoproject
\ No newline at end of file
+*.komodoproject
diff --git a/examples/InventoryCount/Instructions.md b/examples/InventoryCount/Instructions.md
new file mode 100644
index 000000000..456f5d4fe
--- /dev/null
+++ b/examples/InventoryCount/Instructions.md
@@ -0,0 +1,5 @@
+An example of inventory counting using the spaCy NLP library. It shows how to instantiate spaCy's English class, and how to iterate quickly by keeping the loaded model in memory while the main module under development is reloaded.
+
+In the future, a better implementation would apply machine learning to each query: learn what to classify as the quantitative statement ('55 kgs OF') versus the actual item being counted (e.g. how likely a prepositional object is to be the counted item when certain qualifiers appear in the statement).
+
+
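The decoders in this example key off the dependency labels that spaCy's parser assigns. As orientation before reading them, here is a minimal sketch of how to inspect those labels, assuming the 2016-era spacy.en API used throughout this patch; the exact labels can vary with the parser model version:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'i got 65 kgs of carrots')
    for token in doc:
        # typically '65' attaches to 'kgs' as nummod, and 'carrots' attaches
        # to 'of' as pobj; these two labels drive the decoders below
        print token.orth_, token.pos_, token.dep_, token.head.orth_
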
+ """ + originalQuery = None + item = "" + unit = "" + amount = "" + + def __init__(self, statement): + """ + Constructor - only takes in the original query/statement + :return: new Inventory object + """ + + self.originalQuery = statement + pass + + def __str__(self): + return str(self.amount) + ' ' + str(self.unit) + ' ' + str(self.item) + + def printInfo(self): + print '-------------Inventory Count------------' + print "Original Query: " + str(self.originalQuery) + print 'Amount: ' + str(self.amount) + print 'Unit: ' + str(self.unit) + print 'Item: ' + str(self.item) + print '----------------------------------------' + + def isValid(self): + if not self.item or not self.unit or not self.amount or not self.originalQuery: + return False + else: + return True diff --git a/examples/InventoryCount/inventoryCount.py b/examples/InventoryCount/inventoryCount.py new file mode 100644 index 000000000..b1b7b43c8 --- /dev/null +++ b/examples/InventoryCount/inventoryCount.py @@ -0,0 +1,92 @@ +from inventory import Inventory + + +def runTest(nlp): + testset = [] + testset += [nlp(u'6 lobster cakes')] + testset += [nlp(u'6 avacados')] + testset += [nlp(u'fifty five carrots')] + testset += [nlp(u'i have 55 carrots')] + testset += [nlp(u'i got me some 9 cabbages')] + testset += [nlp(u'i got 65 kgs of carrots')] + + result = [] + for doc in testset: + c = decodeInventoryEntry_level1(doc) + if not c.isValid(): + c = decodeInventoryEntry_level2(doc) + result.append(c) + + for i in result: + i.printInfo() + + +def decodeInventoryEntry_level1(document): + """ + Decodes a basic entry such as: '6 lobster cake' or '6' cakes + @param document : NLP Doc object + :return: Status if decoded correctly (true, false), and Inventory object + """ + count = Inventory(str(document)) + for token in document: + if token.pos_ == (u'NOUN' or u'NNS' or u'NN'): + item = str(token) + + for child in token.children: + if child.dep_ == u'compound' or child.dep_ == u'ad': + item = str(child) + str(item) + elif child.dep_ == u'nummod': + count.amount = str(child).strip() + for numerical_child in child.children: + # this isn't arithmetic rather than treating it such as a string + count.amount = str(numerical_child) + str(count.amount).strip() + else: + print "WARNING: unknown child: " + str(child) + ':'+str(child.dep_) + + count.item = item + count.unit = item + + return count + + +def decodeInventoryEntry_level2(document): + """ + Entry level 2, a more complicated parsing scheme that covers examples such as + 'i have 80 boxes of freshly baked pies' + + @document @param document : NLP Doc object + :return: Status if decoded correctly (true, false), and Inventory object- + """ + + count = Inventory(str(document)) + + for token in document: + # Look for a preposition object that is a noun (this is the item we are counting). 
diff --git a/examples/InventoryCount/main.py b/examples/InventoryCount/main.py
new file mode 100644
index 000000000..497a740cb
--- /dev/null
+++ b/examples/InventoryCount/main.py
@@ -0,0 +1,31 @@
+import inventoryCount as mainModule
+import os
+from spacy.en import English, LOCAL_DATA_DIR
+data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
+
+if __name__ == '__main__':
+    """
+    Main module for this example - loads the English NLP class and
+    keeps it in RAM while waiting for the user to re-run it. This lets the
+    developer re-edit the module under test without waiting for the
+    English class to load on every run.
+    """
+
+    # Construct the NLP object here with only the components you want,
+    # or just leave it blank to load all of them
+    print "Loading English module... this will take a while."
+    nlp = English()
+    print "Done loading English module."
+    while True:
+        try:
+            reload(mainModule)
+            mainModule.runTest(nlp)
+            raw_input('================ To reload main module, press Enter ================')
+
+
+        except Exception, e:
+            print "Unexpected error: " + str(e)
+            continue
+
+

From 46e3f979f10facd2a9c05f4f52a8529cd49a1e47 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Fri, 11 Mar 2016 17:31:06 +0100
Subject: [PATCH 2/9] add function for setting head and label to token

change PseudoProjectivity.deprojectivize to use these functions

---
 spacy/syntax/nonproj.pyx             |  19 +++--
 spacy/tests/tokens/test_token_api.py |  64 +++++++++++++++
 spacy/tokens/token.pxd               |   2 +-
 spacy/tokens/token.pyx               | 116 +++++++++++++++++++++++
 4 files changed, 192 insertions(+), 9 deletions(-)

diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index fb04ecb2d..9339efb39 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -118,15 +118,18 @@ class PseudoProjectivity:
         # reattach arcs with decorated labels (following HEAD scheme)
         # for each decorated arc X||Y, search top-down, left-to-right,
         # breadth-first until hitting a Y then make this the new head
-        parse = tokens.to_array([HEAD, DEP])
-        labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
+        #parse = tokens.to_array([HEAD, DEP])
         for token in tokens:
             if cls.is_decorated(token.dep_):
                 newlabel,headlabel = cls.decompose(token.dep_)
                 newhead = cls._find_new_head(token,headlabel)
-                parse[token.i,1] = tokens.vocab.strings[newlabel]
-                parse[token.i,0] = newhead.i - token.i
-        tokens.from_array([HEAD, DEP],parse)
+                token.head = newhead
+                token.dep_ = newlabel
+
+                # tokens.attach(token,newhead,newlabel)
+                #parse[token.i,1] = tokens.vocab.strings[newlabel]
+                #parse[token.i,0] = newhead.i - token.i
+        #tokens.from_array([HEAD, DEP],parse)
 
 
 @@ -168,7 +171,7 @@ class PseudoProjectivity:
 
     @classmethod
     def 
_find_new_head(cls, token, headlabel): - # search through the tree starting from root + # search through the tree starting from the head of the given token # returns the id of the first descendant with the given label # if there is none, return the current head (no change) queue = [token.head] @@ -176,8 +179,8 @@ class PseudoProjectivity: next_queue = [] for qtoken in queue: for child in qtoken.children: - if child == token: - continue + if child.is_space: continue + if child == token: continue if child.dep_ == headlabel: return child next_queue.append(child) diff --git a/spacy/tests/tokens/test_token_api.py b/spacy/tests/tokens/test_token_api.py index 6deaadfbf..fba8a4d67 100644 --- a/spacy/tests/tokens/test_token_api.py +++ b/spacy/tests/tokens/test_token_api.py @@ -62,3 +62,67 @@ def test_vectors(EN): assert sum(apples.vector) != sum(oranges.vector) assert apples.vector_norm != oranges.vector_norm +@pytest.mark.models +def test_ancestors(EN): + # the structure of this sentence depends on the English annotation scheme + tokens = EN(u'Yesterday I saw a dog that barked loudly.') + ancestors = [ t.orth_ for t in tokens[6].ancestors ] + assert ancestors == ['dog','saw'] + ancestors = [ t.orth_ for t in tokens[1].ancestors ] + assert ancestors == ['saw'] + ancestors = [ t.orth_ for t in tokens[2].ancestors ] + assert ancestors == [] + + assert tokens[2].is_ancestor_of(tokens[7]) + assert not tokens[6].is_ancestor_of(tokens[2]) + + +@pytest.mark.models +def test_head_setter(EN): + # the structure of this sentence depends on the English annotation scheme + yesterday, i, saw, a, dog, that, barked, loudly, dot = EN(u'Yesterday I saw a dog that barked loudly.') + assert barked.n_lefts == 1 + assert barked.n_rights == 1 + assert barked.left_edge == that + assert barked.right_edge == loudly + + assert dog.n_lefts == 1 + assert dog.n_rights == 1 + assert dog.left_edge == a + assert dog.right_edge == loudly + + assert a.n_lefts == 0 + assert a.n_rights == 0 + assert a.left_edge == a + assert a.right_edge == a + + assert saw.left_edge == yesterday + assert saw.right_edge == dot + + barked.head = a + + assert barked.n_lefts == 1 + assert barked.n_rights == 1 + assert barked.left_edge == that + assert barked.right_edge == loudly + + assert a.n_lefts == 0 + assert a.n_rights == 1 + assert a.left_edge == a + assert a.right_edge == loudly + + assert dog.n_lefts == 1 + assert dog.n_rights == 0 + assert dog.left_edge == a + assert dog.right_edge == loudly + + assert saw.left_edge == yesterday + assert saw.right_edge == dot + + yesterday.head = that + + assert that.left_edge == yesterday + assert barked.left_edge == yesterday + assert a.left_edge == yesterday + assert dog.left_edge == yesterday + assert saw.left_edge == yesterday diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index 8ff0e9587..1706cdc55 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -6,7 +6,7 @@ from .doc cimport Doc cdef class Token: cdef Vocab vocab - cdef const TokenC* c + cdef TokenC* c cdef readonly int i cdef int array_len cdef readonly Doc doc diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 0ff574f1b..c7ba92260 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -142,6 +142,8 @@ cdef class Token: property dep: def __get__(self): return self.c.dep + def __set__(self, int label): + self.c.dep = label property has_vector: def __get__(self): @@ -250,10 +252,122 @@ cdef class Token: def __get__(self): return self.doc[self.c.r_edge] + property ancestors: + def __get__(self): 
+ cdef const TokenC* head_ptr = self.c + # guard against infinite loop, no token can have + # more ancestors than tokens in the tree + cdef int i = 0 + while head_ptr.head != 0 and i < self.doc.length: + head_ptr += head_ptr.head + yield self.doc[head_ptr - (self.c - self.i)] + i += 1 + + def is_ancestor_of(self, descendant): + return any( ancestor.i == self.i for ancestor in descendant.ancestors ) + property head: def __get__(self): """The token predicted by the parser to be the head of the current token.""" return self.doc[self.i + self.c.head] + def __set__(self, Token new_head): + # this function sets the head of self to new_head + # and updates the counters for left/right dependents + # and left/right corner for the new and the old head + + # do nothing if old head is new head + if self.i + self.c.head == new_head.i: + return + + cdef Token old_head = self.head + cdef int rel_newhead_i = new_head.i - self.i + + # is the new head a descendant of the old head + cdef bint is_desc = old_head.is_ancestor_of(new_head) + + cdef int token_i + cdef int new_edge + cdef Token anc + + # update number of deps of old head + if self.c.head > 0: # left dependent + old_head.c.l_kids -= 1 + if self.c.l_edge == old_head.c.l_edge: + # the token dominates the left edge so the left edge of the head + # may change when the token is reattached + # it may not change if the new head is a descendant of the current head + + # find new l_edge if new head is not a descendant of old head + # a new l_edge is any token between l_edge and old_head + # that is a descendant of old_head but not of self + new_edge = self.c.l_edge + if not is_desc: + for token_i in range(old_head.l_edge+1,old_head.i): + if self.doc.c[token_i].l_kids == 0: # only a token without left deps can be a left edge + if self.is_ancestor_of(self.doc[token_i]): + continue + if old_head.is_ancestor_of(self.doc[token_i]): + new_edge = token_i + break + else: # set the new l_edge to old_head if no other was found + new_edge = old_head.i + + # assign new l_edge to old_head + old_head.c.l_edge = new_edge + # walk up the tree from old_head and assign new l_edge to ancestors + # until an ancestor already has an l_edge that's further left + for anc in old_head.ancestors: + if anc.c.l_edge <= new_edge: + break + anc.c.l_edge = new_edge + + elif self.c.head < 0: # right dependent + old_head.c.r_kids -= 1 + # do the same thing as for l_edge + if self.c.r_edge == old_head.c.r_edge: + new_edge = self.c.r_edge + if not is_desc: + for token_i in range(old_head.r_edge-1,old_head.i,-1): + if self.doc.c[token_i].r_kids == 0: + if self.is_ancestor_of(self.doc[token_i]): + continue + if old_head.is_ancestor_of(self.doc[token_i]): + new_edge = token_i + break + else: + new_edge = old_head.i + old_head.c.r_edge = new_edge + for anc in old_head.ancestors: + if anc.c.r_edge >= new_edge: + break + anc.c.r_edge = new_edge + + # update number of deps of new head + if rel_newhead_i > 0: # left dependent + new_head.c.l_kids += 1 + # walk up the tree from new head and set l_edge to self.l_edge + # until you hit a token with an l_edge further to the left + if self.c.l_edge < new_head.c.l_edge: + new_edge = self.c.l_edge + new_head.c.l_edge = new_edge + for anc in new_head.ancestors: + if anc.c.l_edge <= new_edge: + break + anc.c.l_edge = new_edge + + elif rel_newhead_i < 0: # right dependent + new_head.c.r_kids += 1 + # do the same as for l_edge + if self.c.r_edge > new_head.c.r_edge: + new_edge = self.c.r_edge + new_head.c.r_edge = new_edge + for anc in new_head.ancestors: + if 
anc.c.r_edge >= new_edge: + break + anc.c.r_edge = new_edge + + # set new head + self.c.head = rel_newhead_i property conjuncts: def __get__(self): @@ -325,6 +439,8 @@ cdef class Token: property dep_: def __get__(self): return self.vocab.strings[self.c.dep] + def __set__(self, unicode label): + self.c.dep = self.vocab.strings[label] property is_oov: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) From 2ae253ef5b9e2f5d846da38dd9d9658bf139919e Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Mon, 14 Mar 2016 13:43:48 +0100 Subject: [PATCH 3/9] changed head.__set__ to make it simpler --- spacy/tokens/token.pyx | 59 ++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index c7ba92260..8b920934c 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -285,9 +285,8 @@ cdef class Token: # is the new head a descendant of the old head cdef bint is_desc = old_head.is_ancestor_of(new_head) - cdef int token_i cdef int new_edge - cdef Token anc + cdef Token anc, child # update number of deps of old head if self.c.head > 0: # left dependent @@ -297,23 +296,18 @@ cdef class Token: # may change when the token is reattached # it may not change if the new head is a descendant of the current head - # find new l_edge if new head is not a descendant of old head - # a new l_edge is any token between l_edge and old_head - # that is a descendant of old_head but not of self new_edge = self.c.l_edge + # the new l_edge is the left-most l_edge on any of the other dependents + # where the l_edge is left of the head, otherwise it is the head if not is_desc: - for token_i in range(old_head.l_edge+1,old_head.i): - if self.doc.c[token_i].l_kids == 0: # only a token without left deps can be a left edge - if self.is_ancestor_of(self.doc[token_i]): - continue - if old_head.is_ancestor_of(self.doc[token_i]): - new_edge = token_i - break - else: # set the new l_edge to old_head if no other was found - new_edge = old_head.i + new_edge = old_head.i + for child in old_head.children: + if child == self: + continue + if child.c.l_edge < new_edge: + new_edge = child.c.l_edge + old_head.c.l_edge = new_edge - # assign new l_edge to old_head - old_head.c.l_edge = new_edge # walk up the tree from old_head and assign new l_edge to ancestors # until an ancestor already has an l_edge that's further left for anc in old_head.ancestors: @@ -326,17 +320,16 @@ cdef class Token: # do the same thing as for l_edge if self.c.r_edge == old_head.c.r_edge: new_edge = self.c.r_edge + if not is_desc: - for token_i in range(old_head.r_edge-1,old_head.i,-1): - if self.doc.c[token_i].r_kids == 0: - if self.is_ancestor_of(self.doc[token_i]): - continue - if old_head.is_ancestor_of(self.doc[token_i]): - new_edge = token_i - break - else: - new_edge = old_head.i - old_head.c.r_edge = new_edge + new_edge = old_head.i + for child in old_head.children: + if child == self: + continue + if child.c.r_edge > new_edge: + new_edge = child.c.r_edge + old_head.c.r_edge = new_edge + for anc in old_head.ancestors: if anc.c.r_edge >= new_edge: break @@ -348,23 +341,21 @@ cdef class Token: # walk up the tree from new head and set l_edge to self.l_edge # until you hit a token with an l_edge further to the left if self.c.l_edge < new_head.c.l_edge: - new_edge = self.c.l_edge - new_head.c.l_edge = new_edge + new_head.c.l_edge = self.c.l_edge for anc in new_head.ancestors: - if anc.c.l_edge <= new_edge: + if anc.c.l_edge <= self.c.l_edge: break 
- anc.c.l_edge = new_edge + anc.c.l_edge = self.c.l_edge elif rel_newhead_i < 0: # right dependent new_head.c.r_kids += 1 # do the same as for l_edge if self.c.r_edge > new_head.c.r_edge: - new_edge = self.c.r_edge - new_head.c.r_edge = new_edge + new_head.c.r_edge = self.c.r_edge for anc in new_head.ancestors: - if anc.c.r_edge >= new_edge: + if anc.c.r_edge >= self.c.r_edge: break - anc.c.r_edge = new_edge + anc.c.r_edge = self.c.r_edge # set new head self.c.head = rel_newhead_i From a7d7ea3afa776132d5f46f2f1b59a4deeda1748c Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Thu, 24 Mar 2016 11:19:43 +0100 Subject: [PATCH 4/9] first idea for supporting multiple langs in download script --- spacy/__init__.py | 9 ++++++-- spacy/about.py | 14 ++++++++++++- spacy/de/download.py | 13 ++++++++++++ spacy/download.py | 33 +++++++++++++++++++++++++++++ spacy/en/download.py | 49 ++------------------------------------------ spacy/util.py | 14 +++++++------ 6 files changed, 76 insertions(+), 56 deletions(-) create mode 100644 spacy/de/download.py create mode 100644 spacy/download.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 70e72b7a1..b09ee3491 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,8 +1,13 @@ from . import util -from .en import English +from .about import __models__ +import importlib def load(name, vectors=None, via=None): - return English( + if name not in __models__: + raise Exception('Model %s not found.' % name) + + mod = importlib.import_module('.%s' % __models__[name]['module'], 'spacy') + return getattr(mod, __models__[name]['class'])( package=util.get_package_by_name(name, via=via), vectors_package=util.get_package_by_name(vectors, via=via)) diff --git a/spacy/about.py b/spacy/about.py index 3814b8d61..eed7c3f81 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -10,4 +10,16 @@ __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' __email__ = 'matt@spacy.io' __license__ = 'MIT' -__default_model__ = 'en>=1.0.0,<1.1.0' +__models__ = { + 'en': { + 'module': 'en', + 'class': 'English', + 'package': 'en>=1.0.0,<1.1.0', + }, + 'de': { + 'module': 'de', + 'class': 'German', + 'package': 'de>=1.0.0,<1.1.0', + }, +} +__default_model__ = 'en' diff --git a/spacy/de/download.py b/spacy/de/download.py new file mode 100644 index 000000000..ba57c1d31 --- /dev/null +++ b/spacy/de/download.py @@ -0,0 +1,13 @@ +import plac +from ..download import download + + +@plac.annotations( + force=("Force overwrite", "flag", "f", bool), +) +def main(data_size='all', force=False): + download('de', force) + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/download.py b/spacy/download.py new file mode 100644 index 000000000..537c06872 --- /dev/null +++ b/spacy/download.py @@ -0,0 +1,33 @@ +from __future__ import print_function + +import sys + +import sputnik +from sputnik.package_list import (PackageNotFoundException, + CompatiblePackageNotFoundException) + +from . import about + + +def download(lang, force=False): + if force: + sputnik.purge(about.__title__, about.__version__) + + try: + sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + print("Model already installed. Please run 'python -m " + "spacy.%s.download --force' to reinstall." 
% lang, file=sys.stderr) + sys.exit(1) + except (PackageNotFoundException, CompatiblePackageNotFoundException): + pass + + package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]['package']) + + try: + sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + except (PackageNotFoundException, CompatiblePackageNotFoundException): + print("Model failed to install. Please run 'python -m " + "spacy.%s.download --force'." % lang, file=sys.stderr) + sys.exit(1) + + print("Model successfully installed.", file=sys.stderr) diff --git a/spacy/en/download.py b/spacy/en/download.py index 993b8b16d..f0c23b088 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -1,57 +1,12 @@ -from __future__ import print_function - -import sys -import os -import shutil - import plac -import sputnik -from sputnik.package_list import (PackageNotFoundException, - CompatiblePackageNotFoundException) - -from .. import about - - -def migrate(path): - data_path = os.path.join(path, 'data') - if os.path.isdir(data_path): - if os.path.islink(data_path): - os.unlink(data_path) - else: - shutil.rmtree(data_path) - for filename in os.listdir(path): - if filename.endswith('.tgz'): - os.unlink(os.path.join(path, filename)) +from ..download import download @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - if force: - sputnik.purge(about.__title__, about.__version__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - print("Model already installed. Please run 'python -m " - "spacy.en.download --force' to reinstall.", file=sys.stderr) - sys.exit(1) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - pass - - package = sputnik.install(about.__title__, about.__version__, about.__default_model__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - print("Model failed to install. Please run 'python -m " - "spacy.en.download --force'.", file=sys.stderr) - sys.exit(1) - - # FIXME clean up old-style packages - migrate(os.path.dirname(os.path.abspath(__file__))) - - print("Model successfully installed.", file=sys.stderr) + download('en', force) if __name__ == '__main__': diff --git a/spacy/util.py b/spacy/util.py index bcc55c656..37d3b7bab 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -23,15 +23,17 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): try: return sputnik.package(about.__title__, about.__version__, - name or about.__default_model__, data_path=via) + name or about.__models__[about.__default_model__]['package'], + data_path=via) except PackageNotFoundException as e: raise RuntimeError("Model %s not installed. Please run 'python -m " - "spacy.en.download' to install latest compatible " - "model." % name) + "spacy.%s.download' to install latest compatible " + "model." % (name, about.__models__[name]['module'])) except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model is not compatible with spaCy " - "version. Please run 'python -m spacy.en.download " - "--force' to install latest compatible model.") + raise RuntimeError("Installed model %s is not compatible with spaCy " + "version. Please run 'python -m spacy.%s.download " + "--force' to install latest compatible model." 
% + (name, about.__models__[name]['module'])) def normalize_slice(length, start, stop, step=None): From f2cfbfc412f6c7e82afe37083c8a6d181779139f Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Thu, 24 Mar 2016 15:09:55 +0100 Subject: [PATCH 5/9] remove internal redundancy and overhead from StringStore --- spacy/strings.pxd | 1 + spacy/strings.pyx | 84 +++++++++++++++++++++++------------------------ 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 6ba86d2ce..e2cd579c0 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -24,3 +24,4 @@ cdef class StringStore: cdef int64_t _resize_at cdef const Utf8Str* intern(self, unicode py_string) except NULL + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL diff --git a/spacy/strings.pyx b/spacy/strings.pyx index d11936d12..aa1f5c92d 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,30 +1,25 @@ -from __future__ import unicode_literals -import codecs +from __future__ import unicode_literals, absolute_import +cimport cython from libc.string cimport memcpy +from libc.stdint cimport uint64_t + from murmurhash.mrmr cimport hash64 from preshed.maps cimport map_iter, key_t -from cpython cimport PyUnicode_AS_DATA -from cpython cimport PyUnicode_GET_DATA_SIZE - -from libc.stdint cimport int64_t - - -from .typedefs cimport hash_t, attr_t - -try: - import codecs as io -except ImportError: - import io +from .typedefs cimport hash_t import ujson as json cpdef hash_t hash_string(unicode string) except 0: chars = string.encode('utf8') - return hash64(chars, len(chars), 1) + return _hash_utf8(chars, len(chars)) + + +cdef hash_t _hash_utf8(char* utf8_string, int length): + return hash64(utf8_string, length, 1) cdef unicode _decode(const Utf8Str* string): @@ -92,45 +87,43 @@ cdef class StringStore: def __getitem__(self, object string_or_id): cdef bytes byte_string - cdef unicode py_string cdef const Utf8Str* utf8str + cdef unsigned int int_id - cdef int id_ - if isinstance(string_or_id, int) or isinstance(string_or_id, long): - if string_or_id == 0: - return u'' - elif string_or_id < 1 or string_or_id >= self.size: + if isinstance(string_or_id, (int, long)): + try: + int_id = string_or_id + except OverflowError: raise IndexError(string_or_id) - utf8str = &self.c[string_or_id] + if int_id == 0: + return u'' + elif int_id >= self.size: + raise IndexError(string_or_id) + utf8str = &self.c[int_id] return _decode(utf8str) elif isinstance(string_or_id, bytes): - if len(string_or_id) == 0: + byte_string = string_or_id + if len(byte_string) == 0: return 0 - py_string = string_or_id.decode('utf8') - utf8str = self.intern(py_string) + utf8str = self._intern_utf8(byte_string, len(byte_string)) return utf8str - self.c elif isinstance(string_or_id, unicode): - if len(string_or_id) == 0: + if len(string_or_id) == 0: return 0 - py_string = string_or_id - utf8str = self.intern(py_string) + byte_string = (string_or_id).encode('utf8') + utf8str = self._intern_utf8(byte_string, len(byte_string)) return utf8str - self.c else: raise TypeError(type(string_or_id)) def __contains__(self, unicode string): cdef hash_t key = hash_string(string) - value = self._map.get(key) - return True if value is not NULL else False + return self._map.get(key) is not NULL def __iter__(self): cdef int i for i in range(self.size): - if i == 0: - yield u'' - else: - utf8str = &self.c[i] - yield _decode(utf8str) + yield _decode(&self.c[i]) if i > 0 else u'' def __reduce__(self): strings = [""] @@ -142,21 
+135,26 @@ cdef class StringStore: cdef const Utf8Str* intern(self, unicode py_string) except NULL: # 0 means missing, but we don't bother offsetting the index. - cdef hash_t key = hash_string(py_string) + cdef bytes byte_string = py_string.encode('utf8') + return self._intern_utf8(byte_string, len(byte_string)) + + @cython.final + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL: + # 0 means missing, but we don't bother offsetting the index. + cdef hash_t key = _hash_utf8(utf8_string, length) value = self._map.get(key) - if value != NULL: + if value is not NULL: return value if self.size == self._resize_at: self._realloc() - cdef bytes byte_string = py_string.encode('utf8') - self.c[self.size] = _allocate(self.mem, byte_string, len(byte_string)) + self.c[self.size] = _allocate(self.mem, utf8_string, length) self._map.set(key, &self.c[self.size]) self.size += 1 return &self.c[self.size-1] def dump(self, file_): - string_data = json.dumps([s for s in self]) + string_data = json.dumps(list(self)) if not isinstance(string_data, unicode): string_data = string_data.decode('utf8') file_.write(string_data) @@ -166,8 +164,10 @@ cdef class StringStore: if strings == ['']: return None cdef unicode string - for string in strings: - if string: + for string in strings: + # explicit None/len check instead of simple truth testing + # (bug in Cython <= 0.23.4) + if string is not None and len(string): self.intern(string) def _realloc(self): From f18805ee1c4c37632cc5cfbc369223cb1fdd4641 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Thu, 24 Mar 2016 15:40:12 +0100 Subject: [PATCH 6/9] make StringStore.__contains__() return True for the empty string (which is also contained in iteration) --- spacy/strings.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index aa1f5c92d..c890cdd22 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -116,7 +116,9 @@ cdef class StringStore: else: raise TypeError(type(string_or_id)) - def __contains__(self, unicode string): + def __contains__(self, unicode string not None): + if len(string) == 0: + return True cdef hash_t key = hash_string(string) return self._map.get(key) is not NULL From b8f63071eb1a8a1523ca91819485a350afd83c14 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Mar 2016 18:54:45 +0100 Subject: [PATCH 7/9] add lang registration facility --- spacy/__init__.py | 21 ++++++++++++--------- spacy/about.py | 14 +++----------- spacy/download.py | 6 +++--- spacy/tokenizer.pyx | 3 +-- spacy/util.py | 32 ++++++++++++++++++++++++-------- spacy/vocab.pyx | 1 - 6 files changed, 43 insertions(+), 34 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index b09ee3491..f47926a63 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,13 +1,16 @@ from . import util -from .about import __models__ -import importlib + +from .en import English +from .de import German +from . import util + + +util.register_lang(English.lang, English) +util.register_lang(German.lang, German) def load(name, vectors=None, via=None): - if name not in __models__: - raise Exception('Model %s not found.' 
% name) - - mod = importlib.import_module('.%s' % __models__[name]['module'], 'spacy') - return getattr(mod, __models__[name]['class'])( - package=util.get_package_by_name(name, via=via), - vectors_package=util.get_package_by_name(vectors, via=via)) + package = util.get_package_by_name(name, via=via) + vectors_package = util.get_package_by_name(vectors, via=via) + cls = util.get_lang(name) + return cls(package=package, vectors_package=vectors_package) diff --git a/spacy/about.py b/spacy/about.py index eed7c3f81..7f889cad8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -11,15 +11,7 @@ __author__ = 'Matthew Honnibal' __email__ = 'matt@spacy.io' __license__ = 'MIT' __models__ = { - 'en': { - 'module': 'en', - 'class': 'English', - 'package': 'en>=1.0.0,<1.1.0', - }, - 'de': { - 'module': 'de', - 'class': 'German', - 'package': 'de>=1.0.0,<1.1.0', - }, + 'en': 'en>=1.0.0,<1.1.0', + 'de': 'de>=1.0.0,<1.1.0', } -__default_model__ = 'en' +__default_lang__ = 'en' diff --git a/spacy/download.py b/spacy/download.py index 537c06872..f7fc798ae 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -14,17 +14,17 @@ def download(lang, force=False): sputnik.purge(about.__title__, about.__version__) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) print("Model already installed. Please run 'python -m " "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr) sys.exit(1) except (PackageNotFoundException, CompatiblePackageNotFoundException): pass - package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]['package']) + package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) except (PackageNotFoundException, CompatiblePackageNotFoundException): print("Model failed to install. Please run 'python -m " "spacy.%s.download --force'." % lang, file=sys.stderr) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f8613fce8..44d627505 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -16,8 +16,7 @@ cimport cython from . import util from .tokens.doc cimport Doc -from .util import read_lang_data -from .util import get_package +from .util import read_lang_data, get_package cdef class Tokenizer: diff --git a/spacy/util.py b/spacy/util.py index 37d3b7bab..4eda2d0e4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,6 +14,21 @@ from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +LANGUAGES = {} + + +def register_lang(name, cls): + global LANGUAGES + LANGUAGES[name] = cls + + +def get_lang(name): + lang = re.split('[^a-zA-Z0-9_]', name, 1)[0] + if lang not in LANGUAGES: + raise RuntimeError('Language not supported: %s' % lang) + return LANGUAGES[lang] + + def get_package(data_dir): if not isinstance(data_dir, six.string_types): raise RuntimeError('data_dir must be a string') @@ -21,19 +36,20 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): + package_name = name or about.__models__[about.__default_lang__] + lang = get_lang(package_name) try: return sputnik.package(about.__title__, about.__version__, - name or about.__models__[about.__default_model__]['package'], - data_path=via) + package_name, data_path=via) except PackageNotFoundException as e: - raise RuntimeError("Model %s not installed. 
Please run 'python -m " - "spacy.%s.download' to install latest compatible " - "model." % (name, about.__models__[name]['module'])) + raise RuntimeError("Model '%s' not installed. Please run 'python -m " + "%s.download' to install latest compatible " + "model." % (name, lang.__module__)) except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model %s is not compatible with spaCy " - "version. Please run 'python -m spacy.%s.download " + raise RuntimeError("Installed model is not compatible with spaCy " + "version. Please run 'python -m %s.download " "--force' to install latest compatible model." % - (name, about.__models__[name]['module'])) + (lang.__module__)) def normalize_slice(length, start, stop, step=None): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f876bfefb..3712a7383 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -25,7 +25,6 @@ from . import attrs from . import symbols from cymem.cymem cimport Address -from . import util from .serialize.packer cimport Packer from .attrs cimport PROB From db095a162c12d4e68b11543e16ba5a9c47881d23 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Mar 2016 18:59:47 +0100 Subject: [PATCH 8/9] fix --- spacy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index f47926a63..d01bb11f3 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -2,7 +2,6 @@ from . import util from .en import English from .de import German -from . import util util.register_lang(English.lang, English) From c90d4a6f17aa2940b744863c2491f23637fe0c24 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 26 Mar 2016 11:44:53 +0100 Subject: [PATCH 9/9] relative imports in __init__.py --- spacy/__init__.py | 12 ++++++------ spacy/util.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index d01bb11f3..676659fdd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,15 +1,15 @@ -from . import util +from .util import set_lang_class, get_lang_class, get_package, get_package_by_name from .en import English from .de import German -util.register_lang(English.lang, English) -util.register_lang(German.lang, German) +set_lang_class(English.lang, English) +set_lang_class(German.lang, German) def load(name, vectors=None, via=None): - package = util.get_package_by_name(name, via=via) - vectors_package = util.get_package_by_name(vectors, via=via) - cls = util.get_lang(name) + package = get_package_by_name(name, via=via) + vectors_package = get_package_by_name(vectors, via=via) + cls = get_lang_class(name) return cls(package=package, vectors_package=vectors_package) diff --git a/spacy/util.py b/spacy/util.py index 4eda2d0e4..b1e93d08b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -17,12 +17,12 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE LANGUAGES = {} -def register_lang(name, cls): +def set_lang_class(name, cls): global LANGUAGES LANGUAGES[name] = cls -def get_lang(name): +def get_lang_class(name): lang = re.split('[^a-zA-Z0-9_]', name, 1)[0] if lang not in LANGUAGES: raise RuntimeError('Language not supported: %s' % lang) @@ -37,7 +37,7 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): package_name = name or about.__models__[about.__default_lang__] - lang = get_lang(package_name) + lang = get_lang_class(package_name) try: return sputnik.package(about.__title__, about.__version__, package_name, data_path=via)
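
Taken together, patches 2 and 3 turn Token.head into a writable property that keeps the dependent counters and the left/right edges consistent when a token is reattached. A short usage sketch of the new API, mirroring test_head_setter above (assuming an installed English model; the parse follows the English annotation scheme):

    from spacy.en import English

    nlp = English()
    tokens = nlp(u'Yesterday I saw a dog that barked loudly.')
    a, dog, barked = tokens[3], tokens[4], tokens[6]

    print [t.orth_ for t in barked.ancestors]  # ['dog', 'saw']
    barked.head = a                            # reattach 'barked' under 'a'
    print a.right_edge.orth_                   # 'loudly' - edges were updated

On the loading side, patches 4-9 make spacy.load('de') resolve to the German class registered in spacy/__init__.py, so supporting a further language needs only a set_lang_class() call and a downloaded model package.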