From 33dfb4933c3c82c1ed60f7fff9228b44a945817e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Nov 2014 19:53:29 +1100 Subject: [PATCH 01/56] * Remove taggers from Language class. Work on doc strings --- spacy/lang.pyx | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 79a84e936..bc9677e6c 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -23,9 +23,6 @@ from . import util from .util import read_lang_data from .tokens import Tokens -from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser - cdef class Language: def __init__(self, name): @@ -42,12 +39,6 @@ cdef class Language: self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self._load_special_tokenization(rules) - if path.exists(path.join(util.DATA_DIR, name, 'pos')): - self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos')) - else: - self.pos_tagger = None - if path.exists(path.join(util.DATA_DIR, name, 'ner')): - self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner')) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) @@ -244,6 +235,10 @@ cdef class Language: cdef class Lexicon: + '''A map container for a language's Lexeme structs. + + Also interns UTF-8 strings, and maps them to consecutive integer IDs. + ''' def __init__(self): self.mem = Pool() self._dict = PreshMap(2 ** 20) @@ -252,6 +247,7 @@ cdef class Lexicon: self.size = 1 cdef Lexeme* get(self, String* string) except NULL: + '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._dict.get(string.key) if lex != NULL: @@ -266,6 +262,25 @@ cdef class Lexicon: return lex def __getitem__(self, id_or_string): + '''Retrieve a lexeme, given an int ID or a unicode string. If a previously + unseen unicode string is given, a new Lexeme is created and stored. + + This function relies on Cython's struct-to-dict conversion. Python clients + receive a dict keyed by strings (byte or unicode, depending on Python 2/3), + with int values. Cython clients can instead receive a Lexeme struct value. + More efficient Cython access is provided by Lexicon.get, which returns + a Lexeme*. + + Args: + id_or_string (int or unicode): The integer ID of a word, or its unicode + string. If an int >= Lexicon.size, IndexError is raised. + If id_or_string is neither an int nor a unicode string, ValueError + is raised. + + Returns: + lexeme (dict): A Lexeme struct instance, which Cython translates into + a dict if the operator is called from Python. + ''' if type(id_or_string) == int: return self.lexemes.at(id_or_string)[0] cdef String string From 3430d5f629cc40d48c4e65eb0d4338395f5d94a1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 1 Dec 2014 22:55:13 +1100 Subject: [PATCH 02/56] * Revise intro copy. Add NLTK comparison --- docs/source/index.rst | 167 +++++++++++++++++++++++++++++++++++------- 1 file changed, 140 insertions(+), 27 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 97681bfd8..dbadd9fc3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,45 +3,158 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. +================================ spaCy NLP Tokenizer and Lexicon ================================ -spaCy is a library for industrial strength NLP in Python. 
Its core -values are: +spaCy is a library for industrial-strength NLP in Python and Cython. It +assumes that NLP is mostly about solving machine learning problems, and that +solving these problems is mostly about feature extraction. So, spaCy helps you +do feature extraction --- it helps you represent a linguistic context as +a vector of numbers. It's also a great way to create an inverted index, +particularly if you want to index documents on fancier properties. -* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x - faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is - over 100x faster than Stanford's. +For commercial users, a trial license costs $0, with a one-time license fee of +$1,000 to use spaCy in production. For non-commercial users, a GPL license is +available. To quickly get the gist of the license terms, check out the license +user stories. -* **Accuracy**: All spaCy tools are within 0.5% of the current published - state-of-the-art, on both news and web text. NLP moves fast, so always check - the numbers --- and don't settle for tools that aren't backed by - rigorous recent evaluation. -* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You - get 1 --- the best one --- with a simple, low-level interface. This keeps the - code-base small and concrete. Our Python APIs use lists and - dictionaries, and our C/Cython APIs use arrays and simple structs. +Unique Lexicon-centric design +============================= + +spaCy takes care of all string-processing, efficiently and accurately. This +makes a night-and-day difference to your feature extraction code. +Instead of a list of strings, spaCy's tokenizer gives you references to feature-rich +lexeme objects: + + >>> from spacy.en import EN + >>> from spacy.feature_names import SIC, NORM, SHAPE, ASCIIED, PREFIX, SUFFIX, \ + LENGTH, CLUSTER, POS_TYPE, SENSE_TYPE, \ + IS_ALPHA, IS_ASCII, IS_DIGIT, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, \ + LIKE_URL, LIKE_NUMBER + >>> feats = ( + SIC, # ID of the original word form + NORM, # ID of the normalized word form + CLUSTER, # ID of the word's Brown cluster + IS_TITLE, # Was the word title-cased? + POS_TYPE # A cluster ID describing what POS tags the word is usually assigned + ) + >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') + >>> tokens.to_strings() + [u'Split', u'words', u',', u'punctuation', u',', u'emoticons', u'etc.', u'!', u'^_^'] + >>> tokens.to_array(feats)[:5] + array([[ 1, 2, 3, 4], + [...], + [...], + [...]]) + + +spaCy is designed to **make the right thing easy**, where the right thing is to: + +* **Use rich distributional and orthographic features**. Without these, your model + will be very brittle and domain dependent. + +* **Compute features per type, not per token**. Because of Zipf's law, you can + expect this to be exponentially more efficient. + +* **Minimize string processing**, and instead compute with arrays of ID ints. -Comparison ----------- +Comparison with NLTK +==================== -+----------------+-------------+--------+---------------+--------------+ -| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. 
(web) | -+----------------+-------------+--------+---------------+--------------+ -| spaCy | 107,000 | 1.3gb | 96.7 | | -+----------------+-------------+--------+---------------+--------------+ -| Stanford | 8,000 | 1.5gb | 96.7 | | -+----------------+-------------+--------+---------------+--------------+ -| NLTK | 543 | 61mb | 94.0 | | -+----------------+-------------+--------+---------------+--------------+ +`NLTK `_ provides interfaces to a wide-variety of NLP +tools and resources, and its own implementations of a few algorithms. It comes +with comprehensive documentation, and a book introducing concepts in NLP. For +these reasons, it's very widely known. However, if you're trying to make money +or do cutting-edge research, NLTK is not a good choice. + +The `list of stuff in NLTK `_ looks impressive, +but almost none of it is useful for real work. You're not going to make any money, +or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation, +etc. Most of NLTK is there to assist in the explanation ideas in computational +linguistics, at roughly an undergraduate level. +But it also claims to support serious work, by wrapping external tools. + +In a pretty well known essay, Joel Spolsky discusses the pain of dealing with +`leaky abstractions `_. +An abstraction tells you to not care about implementation +details, but sometimes the implementation matters after all. When it +does, you have to waste time revising your assumptions. + +NLTK's wrappers call external tools via subprocesses, and wrap this up so +that it looks like a native API. This abstraction leaks *a lot*. The system +calls impose far more overhead than a normal Python function call, which makes +the most natural way to program against the API infeasible. + + +Case study: POS tagging +----------------------- + +Here's a quick comparison of the following POS taggers: + +* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process + from the command-line; +* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via + NLTK's wrapper; +* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document. +* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document. + + ++-------------------+-------------+--------+ +| System | Speed (w/s) | % Acc. | ++-------------------+-------------+--------+ +| spaCy | 107,000 | 96.7 | ++-------------------+-------------+--------+ +| Stanford (CLI) | 8,000 | 96.7 | ++-------------------+-------------+--------+ +| nltk.pos_tag | 543 | 94.0 | ++-------------------+-------------+--------+ +| nltk.tag.stanford | 209 | 96.7 | ++-------------------+-------------+--------+ + +Experimental details here. Three things are apparent from this comparison: + +1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate; + +2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower + than invoking the model once as a batch process, via the command-line; + +3. spaCy is over 10x faster than the Stanford tagger, even when called + **sentence-by-sentence**. + +The problem is that NLTK simply wraps the command-line +interfaces of these tools, so communication is via a subprocess. NLTK does not +even hold open a pipe for you --- the model is reloaded, again and again. + +To use the wrapper effectively, you should batch up your text as much as possible. +This probably isn't how you would like to structure your pipeline, and you +might not be able to batch up much text at all, e.g. 
if serving a single +request means processing a single document. +Technically, NLTK does give you Python functions to access lots of different +systems --- but, you can't use them as you would expect to use a normal Python +function. The abstraction leaks. + +Here's the bottom-line: the Stanford tools are written in Java, so using them +from Python sucks. You shouldn't settle for this. It's a problem that springs +purely from the tooling, rather than the domain. + +Summary +------- + +NLTK is a well-known Python library for NLP, but for the important bits, you +don't get actual Python modules. You get wrappers which throw to external +tools, via subprocesses. This is not at all the same thing. + +spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other +high-performance Python libraries. So you get a native Python API, but the +performance you expect from a program written in C. .. toctree:: :hidden: :maxdepth: 3 + + features.rst - what/index.rst - why/index.rst - how/index.rst From ea19850a69291df678aefc4bab4deb7a143aa42b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 04:39:12 +1100 Subject: [PATCH 03/56] * Add tokenizer section --- docs/source/index.rst | 74 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index dbadd9fc3..b0dd08417 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,11 +8,11 @@ spaCy NLP Tokenizer and Lexicon ================================ spaCy is a library for industrial-strength NLP in Python and Cython. It -assumes that NLP is mostly about solving machine learning problems, and that +assumes that NLP is mostly about solving large machine learning problems, and that solving these problems is mostly about feature extraction. So, spaCy helps you -do feature extraction --- it helps you represent a linguistic context as -a vector of numbers. It's also a great way to create an inverted index, -particularly if you want to index documents on fancier properties. +do feature extraction --- it includes an excellent set of distributional and +orthographic features, memoizes them efficiently, and maps strings to +consecutive integer values. For commercial users, a trial license costs $0, with a one-time license fee of $1,000 to use spaCy in production. For non-commercial users, a GPL license is @@ -20,6 +20,70 @@ available. To quickly get the gist of the license terms, check out the license user stories. +Tokenization done right +======================= + +Most tokenizers rely on complicated regular expressions. Often, they leave you +with no way to align the tokens back to the original string --- a vital feature +if you want to display some mark-up, such as spelling correction. The regular +expressions also interact, making it hard to accommodate special cases. + +spaCy introduces a **novel tokenization algorithm** that's much faster and much +more flexible: + +.. code-block:: python + + def tokenize(string, prefixes={}, suffixes={}, specials={}): + '''Sketch of spaCy's tokenization algorithm.''' + tokens = [] + cache = {} + for chunk in string.split(): + # Because of Zipf's law, the cache serves the majority of "chunks". + if chunk in cache: + tokens.extend(cache[chunl]) + continue + key = chunk + + subtokens = [] + # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . : + # If we split one off, check whether we're left with a special-case, + # e.g. 
contractions (can't, won't, etc), emoticons, abbreviations, etc. + # This makes the tokenization easy to update and customize. + while chunk: + prefix, chunk = _consume_prefix(chunk, prefixes) + if prefix: + subtokens.append(prefix) + if chunk in specials: + subtokens.extend(specials[chunk]) + break + suffix, chunk = _consume_suffix(chunk, suffixes) + if suffix: + subtokens.append(suffix) + if chunk in specials: + subtokens.extend(specials[chunk]) + break + cache[key] = subtokens + +Your data is going to have its own quirks, so it's really useful to have +a tokenizer you can easily control. To see the limitations of the standard +regex-based approach, check out `CMU's recent work on tokenizing tweets `_. Despite a lot of careful attention, they can't handle all of their +known emoticons correctly --- doing so would interfere with the way they +process other punctuation. This isn't a problem for spaCy: we just add them +all to the special tokenization rules. + +spaCy's tokenizer is also incredibly efficient: + ++--------+---------------+--------------+ +| System | Tokens/second | Speed Factor | ++--------+---------------+--------------+ +| NLTK | 89 000 | 1.00 | ++--------+---------------+--------------+ +| spaCy | 3 093 000 | 38.30 | ++--------+---------------+--------------+ + +spaCy can create an inverted index of the 1.8 billion word Gigaword corpus, +keyed by lemmas, in under half an hour --- on a Macbook Air. + Unique Lexicon-centric design ============================= @@ -114,7 +178,7 @@ Here's a quick comparison of the following POS taggers: | nltk.tag.stanford | 209 | 96.7 | +-------------------+-------------+--------+ -Experimental details here. Three things are apparent from this comparison: +Experimental details TODO. Three things are apparent from this comparison: 1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate; From 2ee8a1e61fcf295b83129cee5ec3f402239fd911 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 15:20:18 +1100 Subject: [PATCH 04/56] * Make intro chattier, explain philosophy better --- docs/source/index.rst | 106 +++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index b0dd08417..808455fd0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -7,19 +7,59 @@ spaCy NLP Tokenizer and Lexicon ================================ -spaCy is a library for industrial-strength NLP in Python and Cython. It -assumes that NLP is mostly about solving large machine learning problems, and that -solving these problems is mostly about feature extraction. So, spaCy helps you -do feature extraction --- it includes an excellent set of distributional and -orthographic features, memoizes them efficiently, and maps strings to -consecutive integer values. +spaCy is a library for industrial-strength NLP in Python and Cython. spaCy's +take on NLP is that it's mostly about feature extraction --- that's the part +that's specific to NLP, so that's what an NLP library should focus on. +It should tell you what the current best-practice is, and help you do exactly +that, quickly and efficiently. -For commercial users, a trial license costs $0, with a one-time license fee of -$1,000 to use spaCy in production. For non-commercial users, a GPL license is -available. To quickly get the gist of the license terms, check out the license -user stories. +Best-practice is to **use lots of large lexicons**. Let's say you hit the word +*belieber* in production. 
What will your system know about this word? A bad +system will only know things about the words in its training corpus, which +probably consists of texts written before Justin Bieber was even born. +It doesn't have to be like that. +Unique Lexicon-centric design +============================= + +spaCy helps you build models that generalise better, by making it easy to use +more robust features. Instead of a list of strings, the tokenizer returns +references to rich lexical types. Its tokenizer returns sequence of references +to rich lexical types. Features which ask about the word's Brown cluster, its +typical part-of-speech tag, how it's usually cased etc require no extra effort: + + >>> from spacy.en import EN + >>> from spacy.feature_names import * + >>> feats = ( + SIC, # ID of the original word form + NORM, # ID of the normalized word form + CLUSTER, # ID of the word's Brown cluster + IS_TITLE, # Was the word title-cased? + POS_TYPE # A cluster ID describing what POS tags the word is usually assigned + ) + >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') + >>> tokens.to_array(feats)[:5] + array([[ 1, 2, 3, 4], + [...], + [...], + [...]]) + + +spaCy is designed to **make the right thing easy**, where the right thing is to: + +* **Use rich distributional and orthographic features**. Without these, your model + will be very brittle and domain dependent. + +* **Compute features per type, not per token**. Because of Zipf's law, you can + expect this to be exponentially more efficient. + +* **Minimize string processing**, and instead compute with arrays of ID ints. + +For the current list of lexical features, see `Lexical Features`_. + +.. _lexical features: features.html + Tokenization done right ======================= @@ -82,48 +122,10 @@ spaCy's tokenizer is also incredibly efficient: +--------+---------------+--------------+ spaCy can create an inverted index of the 1.8 billion word Gigaword corpus, -keyed by lemmas, in under half an hour --- on a Macbook Air. +in under half an hour --- on a Macbook Air. See the `inverted +index tutorial`_. -Unique Lexicon-centric design -============================= - -spaCy takes care of all string-processing, efficiently and accurately. This -makes a night-and-day difference to your feature extraction code. -Instead of a list of strings, spaCy's tokenizer gives you references to feature-rich -lexeme objects: - - >>> from spacy.en import EN - >>> from spacy.feature_names import SIC, NORM, SHAPE, ASCIIED, PREFIX, SUFFIX, \ - LENGTH, CLUSTER, POS_TYPE, SENSE_TYPE, \ - IS_ALPHA, IS_ASCII, IS_DIGIT, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, \ - LIKE_URL, LIKE_NUMBER - >>> feats = ( - SIC, # ID of the original word form - NORM, # ID of the normalized word form - CLUSTER, # ID of the word's Brown cluster - IS_TITLE, # Was the word title-cased? - POS_TYPE # A cluster ID describing what POS tags the word is usually assigned - ) - >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') - >>> tokens.to_strings() - [u'Split', u'words', u',', u'punctuation', u',', u'emoticons', u'etc.', u'!', u'^_^'] - >>> tokens.to_array(feats)[:5] - array([[ 1, 2, 3, 4], - [...], - [...], - [...]]) - - -spaCy is designed to **make the right thing easy**, where the right thing is to: - -* **Use rich distributional and orthographic features**. Without these, your model - will be very brittle and domain dependent. - -* **Compute features per type, not per token**. 
Because of Zipf's law, you can - expect this to be exponentially more efficient. - -* **Minimize string processing**, and instead compute with arrays of ID ints. - +.. _inverted index tutorial: index_tutorial.html Comparison with NLTK ==================== @@ -221,4 +223,4 @@ performance you expect from a program written in C. :maxdepth: 3 features.rst - + license_stories.rst From 8c2938fe0157af0368a7eaf2671106702232e018 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 23:46:59 +1100 Subject: [PATCH 05/56] * Rename Lexicon._dict to Lexicon._map --- spacy/lang.pxd | 15 +++------------ spacy/lang.pyx | 14 +++++++------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 68f1ee58a..dc3262771 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -1,5 +1,7 @@ from libcpp.vector cimport vector +from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER + from preshed.maps cimport PreshMap from cymem.cymem cimport Pool @@ -7,17 +9,9 @@ from .typedefs cimport hash_t from .tokens cimport Tokens from .lexeme cimport Lexeme from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser from .utf8string cimport StringStore -cdef extern from "Python.h": - cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch) - cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch) - - cdef struct String: Py_UNICODE* chars size_t n @@ -32,7 +26,7 @@ cdef class Lexicon: cdef Lexeme* get(self, String* s) except NULL - cdef PreshMap _dict + cdef PreshMap _map cdef class Language: @@ -42,9 +36,6 @@ cdef class Language: cdef PreshMap _specials cpdef readonly Lexicon lexicon - cpdef readonly Tagger pos_tagger - cpdef readonly NERParser ner_tagger - cdef object _prefix_re cdef object _suffix_re cdef object _infix_re diff --git a/spacy/lang.pyx b/spacy/lang.pyx index bc9677e6c..df9cf3166 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -241,7 +241,7 @@ cdef class Lexicon: ''' def __init__(self): self.mem = Pool() - self._dict = PreshMap(2 ** 20) + self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) self.size = 1 @@ -249,12 +249,12 @@ cdef class Lexicon: cdef Lexeme* get(self, String* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex - lex = self._dict.get(string.key) + lex = self._map.get(string.key) if lex != NULL: return lex lex = self.mem.alloc(sizeof(Lexeme), 1) lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {}) - self._dict.set(string.key, lex) + self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lex.id] = lex @@ -302,11 +302,11 @@ cdef class Lexicon: assert fp != NULL cdef size_t st cdef hash_t key - for i in range(self._dict.length): - key = self._dict.c_map.cells[i].key + for i in range(self._map.length): + key = self._map.c_map.cells[i].key if key == 0: continue - lexeme = self._dict.c_map.cells[i].value + lexeme = self._map.c_map.cells[i].value st = fwrite(&key, sizeof(key), 1, fp) assert st == 1 st = fwrite(lexeme, sizeof(Lexeme), 1, fp) @@ -331,7 +331,7 @@ cdef class Lexicon: st = fread(lexeme, sizeof(Lexeme), 1, fp) if st != 1: break - self._dict.set(key, lexeme) + self._map.set(key, lexeme) while self.lexemes.size() < (lexeme.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lexeme.id] = lexeme From 522bb0346e038c475c16f94109f302ba0df3c2bb 
Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 2 Dec 2014 23:48:05 +1100 Subject: [PATCH 06/56] * Work on get_array method of Tokens --- spacy/tokens.pxd | 5 +++++ spacy/tokens.pyx | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index d1b2ef10b..36dee698e 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,3 +1,6 @@ +import numpy as np +cimport numpy as np + from cymem.cymem cimport Pool from .lexeme cimport Lexeme @@ -28,6 +31,8 @@ cdef class Tokens: cdef int push_back(self, int i, Lexeme* lexeme) except -1 cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1 + cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) + cdef class Token: cdef StringStore _string_store diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 721e6bb80..ba8812f2e 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -102,6 +102,16 @@ cdef class Tokens: elif tag_type == ENTITY: self.ner[i] = tag + cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features): + cdef int i, j + cdef np.ndarray[atom_t, ndim=2] output + output = np.ndarray(shape=(self.length, len(features)), dtype=int) + for i in range(self.length): + for j, feature in enumerate(features): + output[i, j] = self.lex[i].sic + #output[i, j] = lexeme_get_feature(self.lex[i], feature) + return output + def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) From 14097311ae77afdbe46dc859b23a6da9bf61a124 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 01:33:20 +1100 Subject: [PATCH 07/56] * Make StringStore.__getitem__ accept unicode-typed keys. --- spacy/utf8string.pxd | 2 +- spacy/utf8string.pyx | 16 ++++++++++------ tests/test_intern.py | 8 ++++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 82ae50022..16488b899 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -13,7 +13,7 @@ cdef struct Utf8Str: cdef class StringStore: cdef Pool mem - cdef PreshMap table + cdef PreshMap _map cdef Utf8Str* strings cdef int size cdef int _resize_at diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 18d4a4e5e..426b531f4 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -8,7 +8,7 @@ SEPARATOR = '\n|-SEP-|\n' cdef class StringStore: def __init__(self): self.mem = Pool() - self.table = PreshMap() + self._map = PreshMap() self._resize_at = 10000 self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 @@ -17,17 +17,21 @@ cdef class StringStore: def __get__(self): return self.size-1 - def __getitem__(self, string_or_id): + def __getitem__(self, object string_or_id): cdef bytes byte_string cdef Utf8Str* utf8str - if type(string_or_id) == int or type(string_or_id) == long: + if isinstance(string_or_id, int): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) utf8str = &self.strings[string_or_id] return utf8str.chars[:utf8str.length] - elif type(string_or_id) == bytes: + elif isinstance(string_or_id, bytes): utf8str = self.intern(string_or_id, len(string_or_id)) return utf8str.i + elif isinstance(string_or_id, unicode): + byte_string = string_or_id.encode('utf8') + utf8str = self.intern(byte_string, len(byte_string)) + return utf8str.i else: raise TypeError(type(string_or_id)) @@ -36,7 +40,7 @@ cdef class StringStore: # slot 0 to simplify the code, because it doesn't matter. 
assert length != 0 cdef hash_t key = hash64(chars, length * sizeof(char), 0) - cdef void* value = self.table.get(key) + cdef void* value = self._map.get(key) cdef size_t i if value == NULL: if self.size == self._resize_at: @@ -48,7 +52,7 @@ cdef class StringStore: self.strings[i].chars = self.mem.alloc(length, sizeof(char)) memcpy(self.strings[i].chars, chars, length) self.strings[i].length = length - self.table.set(key, self.size) + self._map.set(key, self.size) self.size += 1 else: i = value diff --git a/tests/test_intern.py b/tests/test_intern.py index 63b4b3433..a7a801b05 100644 --- a/tests/test_intern.py +++ b/tests/test_intern.py @@ -19,8 +19,12 @@ def test_save_bytes(sstore): def test_save_unicode(sstore): - with pytest.raises(TypeError): - A_i = sstore['A'] + Hello_i = sstore[u'Hello'] + assert Hello_i == 1 + assert sstore[u'Hello'] == 1 + assert sstore[u'goodbye'] != Hello_i + assert sstore[u'hello'] != Hello_i + assert Hello_i == 1 def test_zero_id(sstore): From 71b009e3232d923b8719a21d57ddc6ce3ba63c5c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:02:24 +1100 Subject: [PATCH 08/56] * Fix bug in refactored StringStore.__getitem__ --- spacy/utf8string.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 426b531f4..0384a150c 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -5,6 +5,7 @@ import codecs SEPARATOR = '\n|-SEP-|\n' + cdef class StringStore: def __init__(self): self.mem = Pool() @@ -20,7 +21,7 @@ cdef class StringStore: def __getitem__(self, object string_or_id): cdef bytes byte_string cdef Utf8Str* utf8str - if isinstance(string_or_id, int): + if isinstance(string_or_id, int) or isinstance(string_or_id, long): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) utf8str = &self.strings[string_or_id] From b463a7eb8632663fded2d21591b36d78e00d8242 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:04:00 +1100 Subject: [PATCH 09/56] * Make flag-setting a language-specific thing --- spacy/en.pxd | 26 +++++++++ spacy/en.pyx | 17 +++++- spacy/lang.pxd | 18 ++---- spacy/lang.pyx | 79 ++++++++++++-------------- spacy/lexeme.pxd | 142 +++++++++++++++++++++++++++++++++-------------- spacy/lexeme.pyx | 88 +++++++++++++---------------- 6 files changed, 224 insertions(+), 146 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index a7c643eba..cccfb60a8 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,6 +1,32 @@ from spacy.lang cimport Language from spacy.tokens cimport Tokens +# Flags +cpdef enum FlagID: + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + + LIKE_URL + LIKE_NUMBER + + OFT_LOWER + OFT_TITLE + OFT_UPPER + + IN_MALES + IN_FEMALES + IN_SURNAMES + IN_PLACES + IN_GAMES + IN_CELEBS + IN_NAMES + cdef class English(Language): pass diff --git a/spacy/en.pyx b/spacy/en.pyx index 95c1cbd94..92be97aad 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -38,6 +38,8 @@ provides a fully Penn Treebank 3-compliant tokenizer. from __future__ import unicode_literals cimport lang +from .typedefs cimport flags_t +import orth cdef class English(Language): @@ -47,7 +49,20 @@ cdef class English(Language): name (unicode): The two letter code used by Wikipedia for the language. lexicon (Lexicon): The lexicon. Exposes the lookup method. 
""" - pass + def set_flags(self, unicode string): + cdef flags_t flags = 0 + flags |= orth.is_alpha(string) << IS_ALPHA + flags |= orth.is_ascii(string) << IS_ASCII + flags |= orth.is_digit(string) << IS_DIGIT + flags |= orth.is_lower(string) << IS_LOWER + flags |= orth.is_punct(string) << IS_PUNCT + flags |= orth.is_space(string) << IS_SPACE + flags |= orth.is_title(string) << IS_TITLE + flags |= orth.is_upper(string) << IS_UPPER + + flags |= orth.like_url(string) << LIKE_URL + flags |= orth.like_number(string) << LIKE_NUMBER + return flags EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index dc3262771..9e4bc7b5d 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -8,23 +8,17 @@ from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens from .lexeme cimport Lexeme -from .tagger cimport Tagger -from .utf8string cimport StringStore - - -cdef struct String: - Py_UNICODE* chars - size_t n - hash_t key +from .utf8string cimport StringStore, UniStr cdef class Lexicon: + cpdef public set_flags cdef Pool mem cpdef readonly size_t size cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef Lexeme* get(self, String* s) except NULL + cdef Lexeme* get(self, UniStr* s) except NULL cdef PreshMap _map @@ -43,10 +37,10 @@ cdef class Language: cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 + cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL - cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index df9cf3166..2a284b9df 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -19,6 +19,8 @@ from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init +from .utf8string cimport slice_unicode + from . 
import util from .util import read_lang_data from .tokens import Tokens @@ -34,7 +36,7 @@ cdef class Language: self._prefix_re = re.compile(prefix) self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) - self.lexicon = Lexicon() + self.lexicon = Lexicon(self.set_flags) if path.exists(path.join(util.DATA_DIR, name, 'lexemes')): self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) @@ -45,11 +47,11 @@ cdef class Language: cdef Tokens tokens = Tokens(self.lexicon.strings, length) if length == 0: return tokens - cdef String string_struct + cdef UniStr string_struct cdef unicode py_string cdef int idx = 0 for i, py_string in enumerate(strings): - string_from_unicode(&string_struct, py_string) + slice_unicode(&string_struct, py_string, 0, len(py_string)) tokens.push_back(idx, self.lexicon.get(&string_struct)) idx += len(py_string) + 1 return tokens @@ -77,11 +79,11 @@ cdef class Language: cdef int start = 0 cdef Py_UNICODE* chars = string cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) - cdef String span + cdef UniStr span for i in range(1, length): if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: - string_slice(&span, chars, start, i) + slice_unicode(&span, chars, start, i) lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) @@ -93,7 +95,7 @@ cdef class Language: start += 1 i += 1 if start < i: - string_slice(&span, chars, start, i) + slice_unicode(&span, chars, start, i) lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) @@ -101,7 +103,7 @@ cdef class Language: self._tokenize(tokens, &span, start, i) return tokens - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] suffixes cdef hash_t orig_key @@ -112,20 +114,20 @@ cdef class Language: self._attach_tokens(tokens, start, span, &prefixes, &suffixes) self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL: cdef size_t i - cdef String prefix - cdef String suffix - cdef String minus_pre - cdef String minus_suf + cdef UniStr prefix + cdef UniStr suffix + cdef UniStr minus_pre + cdef UniStr minus_suf cdef size_t last_size = 0 while string.n != 0 and string.n != last_size: last_size = string.n pre_len = self._find_prefix(string.chars, string.n) if pre_len != 0: - string_slice(&prefix, string.chars, 0, pre_len) - string_slice(&minus_pre, string.chars, pre_len, string.n) + slice_unicode(&prefix, string.chars, 0, pre_len) + slice_unicode(&minus_pre, string.chars, pre_len, string.n) # Check whether we've hit a special-case if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL: string[0] = minus_pre @@ -133,15 +135,15 @@ cdef class Language: break suf_len = self._find_suffix(string.chars, string.n) if suf_len != 0: - string_slice(&suffix, string.chars, string.n - suf_len, string.n) - string_slice(&minus_suf, string.chars, 0, string.n - suf_len) + slice_unicode(&suffix, string.chars, string.n - suf_len, string.n) + slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len) # Check whether we've hit a special-case if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL: 
string[0] = minus_suf suffixes.push_back(self.lexicon.get(&suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= string.n: - string_slice(string, string.chars, pre_len, string.n - suf_len) + slice_unicode(string, string.chars, pre_len, string.n - suf_len) prefixes.push_back(self.lexicon.get(&prefix)) suffixes.push_back(self.lexicon.get(&suffix)) elif pre_len: @@ -155,13 +157,13 @@ cdef class Language: return string cdef int _attach_tokens(self, Tokens tokens, - int idx, String* string, + int idx, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1: cdef int split cdef Lexeme** lexemes cdef Lexeme* lexeme - cdef String span + cdef UniStr span if prefixes.size(): idx = tokens.extend(idx, prefixes.data(), prefixes.size()) if string.n != 0: @@ -174,11 +176,11 @@ cdef class Language: if split == 0 or split == -1: idx = tokens.push_back(idx, self.lexicon.get(string)) else: - string_slice(&span, string.chars, 0, split) + slice_unicode(&span, string.chars, 0, split) idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split, split+1) + slice_unicode(&span, string.chars, split, split+1) idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split + 1, string.n) + slice_unicode(&span, string.chars, split + 1, string.n) idx = tokens.push_back(idx, self.lexicon.get(&span)) cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): @@ -222,14 +224,14 @@ cdef class Language: ''' cdef Lexeme** lexemes cdef hash_t hashed - cdef String string + cdef UniStr string for uni_string, substrings in token_rules: lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) for i, substring in enumerate(substrings): - string_from_unicode(&string, substring) + slice_unicode(&string, substring, 0, len(substring)) lexemes[i] = self.lexicon.get(&string) lexemes[i + 1] = NULL - string_from_unicode(&string, uni_string) + slice_unicode(&string, uni_string, 0, len(uni_string)) self._specials.set(string.key, lexemes) self._cache.set(string.key, lexemes) @@ -239,21 +241,23 @@ cdef class Lexicon: Also interns UTF-8 strings, and maps them to consecutive integer IDs. 
''' - def __init__(self): + def __init__(self, object set_flags=None): self.mem = Pool() self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) self.size = 1 + self.set_flags = set_flags - cdef Lexeme* get(self, String* string) except NULL: + cdef Lexeme* get(self, UniStr* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) if lex != NULL: return lex lex = self.mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {}) + lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, + self.strings, {'flags': self.set_flags(string.chars[:string.n])}) self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) @@ -283,14 +287,14 @@ cdef class Lexicon: ''' if type(id_or_string) == int: return self.lexemes.at(id_or_string)[0] - cdef String string - string_from_unicode(&string, id_or_string) + cdef UniStr string + slice_unicode(&string, id_or_string, 0, len(id_or_string)) cdef Lexeme* lexeme = self.get(&string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): - cdef String s - string_from_unicode(&s, uni_string) + cdef UniStr s + slice_unicode(&s, uni_string, 0, len(uni_string)) cdef Lexeme* lex = self.get(&s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) @@ -338,14 +342,3 @@ cdef class Lexicon: i += 1 self.size += 1 fclose(fp) - - -cdef void string_from_unicode(String* s, unicode uni): - cdef Py_UNICODE* c_uni = uni - string_slice(s, c_uni, 0, len(uni)) - - -cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil: - s.chars = &chars[start] - s.n = end - start - s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 0d7d206e5..9d5dddd6d 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,61 +1,119 @@ -from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t +from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t from .utf8string cimport StringStore -from libc.stdint cimport uint16_t -cpdef flag_t OOV_DIST_FLAGS -# Flags -cpdef enum: - IS_ALPHA - IS_ASCII - IS_DIGIT - IS_LOWER - IS_PUNCT - IS_SPACE - IS_TITLE - IS_UPPER +# Reserve 64 values for flag features +cpdef enum attr_id_t: + FLAG0 + FLAG1 + FLAG2 + FLAG3 + FLAG4 + FLAG5 + FLAG6 + FLAG7 + FLAG8 + FLAG9 + FLAG10 + FLAG11 + FLAG12 + FLAG13 + FLAG14 + FLAG15 + FLAG16 + FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 - LIKE_URL - LIKE_NUMBER + ID + SIC + NORM + SHAPE + ASCIIED + PREFIX + SUFFIX - OFT_LOWER - OFT_TITLE - OFT_UPPER - - IN_MALES - IN_FEMALES - IN_SURNAMES - IN_PLACES - IN_GAMES - IN_CELEBS - IN_NAMES + LENGTH + CLUSTER + POS_TYPE + SENSE_TYPE cdef struct Lexeme: - flag_t flags + flags_t flags - id_t id - id_t sic - id_t norm - id_t shape - id_t asciied - id_t prefix - id_t suffix + attr_t id + attr_t sic + attr_t norm + attr_t shape + attr_t asciied + attr_t prefix + attr_t suffix + + attr_t length + attr_t cluster + attr_t pos_type + attr_t sense_type float prob - - len_t length 
- tag_t cluster - tag_t postype - tag_t supersense + float upper_pc + float title_pc cdef Lexeme EMPTY_LEXEME -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, - StringStore store, dict props) except * + +cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, + dict props) except * -cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil: +cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + +cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 64eb699a6..888edc07b 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -6,67 +6,59 @@ from libc.string cimport memset import orth -from .utf8string cimport Utf8Str - -OOV_DIST_FLAGS = 0 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) -def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): - cdef flag_t flags = 0 - flags |= orth.is_alpha(string) << IS_ALPHA - flags |= orth.is_ascii(string) << IS_ASCII - flags |= orth.is_digit(string) << IS_DIGIT - flags |= orth.is_lower(string) << IS_LOWER - flags |= orth.is_punct(string) << IS_PUNCT - flags |= orth.is_space(string) << IS_SPACE - flags |= orth.is_title(string) << IS_TITLE - flags |= orth.is_upper(string) << IS_UPPER - - flags |= orth.like_url(string) << LIKE_URL - flags |= orth.like_number(string) << LIKE_NUMBER - return flags - - cpdef Lexeme init(id_t i, unicode string, hash_t hashed, - StringStore store, dict props) except *: + StringStore string_store, dict props) except *: cdef Lexeme lex lex.id = i lex.length = len(string) - lex.sic = get_string_id(string, store) + lex.sic = string_store[string] lex.cluster = props.get('cluster', 0) - lex.postype = props.get('postype', 0) - lex.supersense = props.get('supersense', 0) + lex.pos_type = props.get('pos_type', 0) + lex.sense_type = props.get('sense_type', 0) lex.prob = props.get('prob', 0) - cdef float upper_pc = props.get('upper_pc', 0.0) - cdef float lower_pc = props.get('lower_pc', 0.0) - cdef float title_pc = props.get('title_pc', 0.0) + lex.upper_pc = props.get('upper_pc', 0.0) + lex.title_pc = props.get('lower_pc', 0.0) - lex.prefix = get_string_id(string[0], store) - lex.suffix = get_string_id(string[-3:], store) - if upper_pc or lower_pc or title_pc: - canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc) - lex.norm = get_string_id(canon_cased, store) - else: - lex.norm = lex.sic - lex.shape = get_string_id(orth.word_shape(string), store) - lex.asciied = get_string_id(orth.asciied(string), store) - lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) - - lex.flags |= props.get('in_males', 0) << IN_MALES - lex.flags |= props.get('in_females', 0) << IN_FEMALES - lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES - lex.flags |= props.get('in_places', 0) << IN_PLACES - lex.flags |= props.get('in_celebs', 0) << IN_CELEBS - lex.flags |= props.get('in_games', 0) << IN_GAMES - lex.flags |= props.get('in_names', 0) << IN_NAMES + lex.prefix = string_store[string[:1]] + lex.suffix = string_store[string[-3:]] + lex.norm = lex.sic # TODO + lex.shape = string_store[orth.word_shape(string)] + lex.asciied = string_store[orth.asciied(string)] + + lex.flags = props.get('flags', 0) return lex -cdef id_t get_string_id(unicode string, StringStore store) except 0: - cdef bytes byte_string = string.encode('utf8') - cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) - return orig_str.i +cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name): + if 
feat_name < (sizeof(flags_t) * 8): + return check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == SIC: + return lex.sic + elif feat_name == NORM: + return lex.norm + elif feat_name == SHAPE: + return lex.shape + elif feat_name == ASCIIED: + return lex.asciied + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + elif feat_name == POS_TYPE: + return lex.pos_type + elif feat_name == SENSE_TYPE: + return lex.sense_type + else: + raise StandardError('Feature ID: %d not found' % feat_name) From e170faf5b0d949dde0beb35e02e790d9678f8b67 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:05:15 +1100 Subject: [PATCH 10/56] * Hack Tokens to work without tagger.pyx --- spacy/tokens.pxd | 7 +++---- spacy/tokens.pyx | 13 +++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 36dee698e..2c97a3163 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -4,9 +4,8 @@ cimport numpy as np from cymem.cymem cimport Pool from .lexeme cimport Lexeme -from .typedefs cimport flag_t +from .typedefs cimport flags_t from .utf8string cimport StringStore -from .tagger cimport TagType from thinc.typedefs cimport atom_t @@ -29,7 +28,7 @@ cdef class Tokens: cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 cdef int push_back(self, int i, Lexeme* lexeme) except -1 - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1 + cpdef int set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) @@ -56,4 +55,4 @@ cdef class Token: cdef public float prob - cdef public flag_t flags + cdef public flags_t flags diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index ba8812f2e..e8e016944 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,7 +1,9 @@ # cython: profile=True from .lexeme cimport * cimport cython -from .tagger cimport POS, ENTITY + +POS = 0 +ENTITY = 0 DEF PADDING = 5 @@ -96,7 +98,7 @@ cdef class Tokens: idx = self.push_back(idx, lexemes[i]) return idx - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1: + cpdef int set_tag(self, int i, int tag_type, int tag) except -1: if tag_type == POS: self.pos[i] = tag elif tag_type == ENTITY: @@ -108,8 +110,7 @@ cdef class Tokens: output = np.ndarray(shape=(self.length, len(features)), dtype=int) for i in range(self.length): for j, feature in enumerate(features): - output[i, j] = self.lex[i].sic - #output[i, j] = lexeme_get_feature(self.lex[i], feature) + output[i, j] = get_attr(self.lex[i], feature) return output def _realloc(self, new_size): @@ -140,8 +141,8 @@ cdef class Token: self.cluster = lex['cluster'] self.length = lex['length'] - self.postype = lex['postype'] - self.sensetype = lex['supersense'] + self.postype = lex['pos_type'] + self.sensetype = lex['sense_type'] self.sic = lex['sic'] self.norm = lex['norm'] self.shape = lex['shape'] From e600f7b3275a6c976dcc67ff6f3fddf07a241777 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:06:00 +1100 Subject: [PATCH 11/56] * Move String struct stuff into the utf8string module, from spacy.lang --- spacy/utf8string.pxd | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 16488b899..6bd5c6757 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -1,5 +1,6 @@ from preshed.maps 
cimport PreshMap from cymem.cymem cimport Pool +from murmurhash.mrmr cimport hash64 from .typedefs cimport utf8_t, id_t, hash_t @@ -11,6 +12,18 @@ cdef struct Utf8Str: int length +cdef struct UniStr: + Py_UNICODE* chars + size_t n + hash_t key + + +cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil: + s.chars = &chars[start] + s.n = end - start + s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) + + cdef class StringStore: cdef Pool mem cdef PreshMap _map From 4560ada85b6cec7e6828c8b2669f300f87f9b9be Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:06:31 +1100 Subject: [PATCH 12/56] * Add typedef for attr_t. Change flag_t to flags_t --- spacy/typedefs.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 21818f05e..893865133 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -2,7 +2,8 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t ctypedef uint64_t hash_t ctypedef char* utf8_t -ctypedef uint64_t flag_t +ctypedef uint32_t attr_t +ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t ctypedef uint16_t tag_t From d0d812c548f09ded9436ff584b9615cc5727eb42 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 11:06:57 +1100 Subject: [PATCH 13/56] * Hack setup.py to exclude tagger stuff --- setup.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index 10ba5b1ae..ae6d5a99d 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ import os.path from os import path from glob import glob +import numpy + def clean(ext): for pyx in ext.sources: @@ -34,7 +36,7 @@ compile_args = [] link_args = [] libs = [] -includes = ['.'] +includes = ['.', numpy.get_include()] cython_includes = ['.'] @@ -50,18 +52,18 @@ exts = [ Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), - Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), - Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.pos_feats", 
["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), ] From d70d31aa45a4bafa868424c84867b7c187aae71f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 15:44:25 +1100 Subject: [PATCH 14/56] * Introduce first attempt at const-ness --- spacy/lang.pxd | 4 ++-- spacy/lang.pyx | 40 +++++++++++++++++++++------------------- spacy/lexeme.pxd | 4 ++-- spacy/lexeme.pyx | 2 +- spacy/tokens.pxd | 8 ++++---- spacy/tokens.pyx | 8 ++++---- 6 files changed, 34 insertions(+), 32 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 9e4bc7b5d..d4b587a6b 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -18,7 +18,7 @@ cdef class Lexicon: cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef Lexeme* get(self, UniStr* s) except NULL + cdef const Lexeme* get(self, UniStr* s) except NULL cdef PreshMap _map @@ -45,5 +45,5 @@ cdef class Language: cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 2a284b9df..8d4ea7802 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -37,11 +37,12 @@ cdef class Language: self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.set_flags) - if path.exists(path.join(util.DATA_DIR, name, 'lexemes')): - self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) - self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self._load_special_tokenization(rules) + def load(self): + self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) + self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) + cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) cdef Tokens tokens = Tokens(self.lexicon.strings, length) @@ -84,7 +85,7 @@ cdef class Language: if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) + lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) else: @@ -96,7 +97,7 @@ cdef class Language: i += 1 if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) + lexemes = self._cache.get(span.key) if lexemes != NULL: tokens.extend(start, lexemes, 0) else: @@ -114,8 +115,8 @@ cdef class Language: self._attach_tokens(tokens, start, span, &prefixes, &suffixes) 
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) - cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except NULL: + cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except NULL: cdef size_t i cdef UniStr prefix cdef UniStr suffix @@ -158,17 +159,17 @@ cdef class Language: cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, - vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except -1: + vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except -1: cdef int split - cdef Lexeme** lexemes + cdef const Lexeme* const* lexemes cdef Lexeme* lexeme cdef UniStr span if prefixes.size(): idx = tokens.extend(idx, prefixes.data(), prefixes.size()) if string.n != 0: - lexemes = self._cache.get(string.key) + lexemes = self._cache.get(string.key) if lexemes != NULL: idx = tokens.extend(idx, lexemes, 0) else: @@ -182,13 +183,13 @@ cdef class Language: idx = tokens.push_back(idx, self.lexicon.get(&span)) slice_unicode(&span, string.chars, split + 1, string.n) idx = tokens.push_back(idx, self.lexicon.get(&span)) - cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin() + cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1: - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) + cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1: + lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) cdef int i for i in range(n): lexemes[i] = tokens[i] @@ -249,7 +250,7 @@ cdef class Lexicon: self.size = 1 self.set_flags = set_flags - cdef Lexeme* get(self, UniStr* string) except NULL: + cdef const Lexeme* get(self, UniStr* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) @@ -289,14 +290,14 @@ cdef class Lexicon: return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) - cdef Lexeme* lexeme = self.get(&string) + cdef const Lexeme* lexeme = self.get(&string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): cdef UniStr s slice_unicode(&s, uni_string, 0, len(uni_string)) - cdef Lexeme* lex = self.get(&s) - lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) + cdef const Lexeme* lex = self.get(&s) + self.lexemes[lex.id][0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): if path.exists(loc): @@ -319,7 +320,8 @@ cdef class Lexicon: assert st == 0 def load(self, loc): - assert path.exists(loc) + if not path.exists(loc): + raise IOError('Lexemes file not found at %s' % loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef FILE* fp = fopen(bytes_loc, 'rb') assert fp != NULL diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 9d5dddd6d..a998aeedb 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -112,8 +112,8 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, dict props) except * -cdef inline bint check_flag(Lexeme* lexeme, attr_id_t flag_id) nogil: +cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) -cdef attr_t get_attr(Lexeme* lex, attr_id_t attr_id) +cdef attr_t get_attr(const Lexeme* lex, 
attr_id_t attr_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 888edc07b..2090ece50 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -35,7 +35,7 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, return lex -cdef attr_t get_attr(Lexeme* lex, attr_id_t feat_name): +cdef attr_t get_attr(const Lexeme* lex, attr_id_t feat_name): if feat_name < (sizeof(flags_t) * 8): return check_flag(lex, feat_name) elif feat_name == ID: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 2c97a3163..f91aa16ba 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -14,11 +14,11 @@ cdef class Tokens: cdef Pool mem cdef StringStore _string_store - cdef Lexeme** _lex_ptr + cdef const Lexeme** _lex_ptr cdef int* _idx_ptr cdef int* _pos_ptr cdef int* _ner_ptr - cdef Lexeme** lex + cdef const Lexeme** lex cdef int* idx cdef int* pos cdef int* ner @@ -26,8 +26,8 @@ cdef class Tokens: cdef int length cdef int max_length - cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 - cdef int push_back(self, int i, Lexeme* lexeme) except -1 + cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1 + cdef int push_back(self, int i, const Lexeme* lexeme) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index e8e016944..7f79dcda9 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -44,7 +44,7 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. - self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) + self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) @@ -74,7 +74,7 @@ cdef class Tokens: def __len__(self): return self.length - cdef int push_back(self, int idx, Lexeme* lexeme) except -1: + cdef int push_back(self, int idx, const Lexeme* lexeme) except -1: if self.length == self.max_length: self._realloc(self.length * 2) self.lex[self.length] = lexeme @@ -84,7 +84,7 @@ cdef class Tokens: self.length += 1 return idx + lexeme.length - cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1: + cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1: cdef int i if lexemes == NULL: return idx @@ -116,7 +116,7 @@ cdef class Tokens: def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) + self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int)) From 7e04c22f8feb2f71bb417ef87887a5654496a09e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 15:58:17 +1100 Subject: [PATCH 15/56] * const added to Lexicon interface. Seems to work. 
--- spacy/lang.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 8d4ea7802..9013de9d0 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -296,8 +296,10 @@ cdef class Lexicon: def __setitem__(self, unicode uni_string, dict props): cdef UniStr s slice_unicode(&s, uni_string, 0, len(uni_string)) - cdef const Lexeme* lex = self.get(&s) - self.lexemes[lex.id][0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) + # Cast through the const here, since we're allowed to change our own + # Lexemes. + lex = self.get(&s) + lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): if path.exists(loc): From d7952634cae783948b7f81543c37ec583a9a9d67 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 3 Dec 2014 16:01:47 +1100 Subject: [PATCH 16/56] * Make the string-store serve const pointers to Utf8Str --- spacy/utf8string.pxd | 2 +- spacy/utf8string.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 6bd5c6757..5ef4113d5 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -31,4 +31,4 @@ cdef class StringStore: cdef int size cdef int _resize_at - cdef Utf8Str* intern(self, char* chars, int length) except NULL + cdef const Utf8Str* intern(self, char* chars, int length) except NULL diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 0384a150c..1d2b7a264 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -20,7 +20,7 @@ cdef class StringStore: def __getitem__(self, object string_or_id): cdef bytes byte_string - cdef Utf8Str* utf8str + cdef const Utf8Str* utf8str if isinstance(string_or_id, int) or isinstance(string_or_id, long): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) @@ -36,7 +36,7 @@ cdef class StringStore: else: raise TypeError(type(string_or_id)) - cdef Utf8Str* intern(self, char* chars, int length) except NULL: + cdef const Utf8Str* intern(self, char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. We waste # slot 0 to simplify the code, because it doesn't matter. 
assert length != 0 From e1b1f45cc942d59bf68513176669f9021695bae7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 20:46:20 +1100 Subject: [PATCH 17/56] * Add STEM attribute to lexeme --- spacy/lexeme.pxd | 36 +++++++++++++++++++++++++++++++++--- spacy/lexeme.pyx | 32 ++------------------------------ 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index a998aeedb..ef0e8fb12 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -72,7 +72,8 @@ cpdef enum attr_id_t: ID SIC - NORM + STEM + DENSE SHAPE ASCIIED PREFIX @@ -89,7 +90,8 @@ cdef struct Lexeme: attr_t id attr_t sic - attr_t norm + attr_t stem + attr_t dense attr_t shape attr_t asciied attr_t prefix @@ -116,4 +118,32 @@ cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) -cdef attr_t get_attr(const Lexeme* lex, attr_id_t attr_id) +cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == SIC: + return lex.sic + elif feat_name == DENSE: + return lex.dense + elif feat_name == STEM: + return lex.stem + elif feat_name == SHAPE: + return lex.shape + elif feat_name == ASCIIED: + return lex.asciied + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + elif feat_name == POS_TYPE: + return lex.pos_type + elif feat_name == SENSE_TYPE: + return lex.sense_type + else: + return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 2090ece50..5c8d7a60e 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -27,38 +27,10 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] - lex.norm = lex.sic # TODO lex.shape = string_store[orth.word_shape(string)] + lex.dense = lex.sic if lex.prob >= -10 else lex.shape + lex.stem = string_store[props.get('stem', string)] lex.asciied = string_store[orth.asciied(string)] lex.flags = props.get('flags', 0) return lex - - -cdef attr_t get_attr(const Lexeme* lex, attr_id_t feat_name): - if feat_name < (sizeof(flags_t) * 8): - return check_flag(lex, feat_name) - elif feat_name == ID: - return lex.id - elif feat_name == SIC: - return lex.sic - elif feat_name == NORM: - return lex.norm - elif feat_name == SHAPE: - return lex.shape - elif feat_name == ASCIIED: - return lex.asciied - elif feat_name == PREFIX: - return lex.prefix - elif feat_name == SUFFIX: - return lex.suffix - elif feat_name == LENGTH: - return lex.length - elif feat_name == CLUSTER: - return lex.cluster - elif feat_name == POS_TYPE: - return lex.pos_type - elif feat_name == SENSE_TYPE: - return lex.sense_type - else: - raise StandardError('Feature ID: %d not found' % feat_name) From 69bb0222041f8d43febc7648c04b903804a6b299 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 20:46:55 +1100 Subject: [PATCH 18/56] * Add as_array and count_by method --- spacy/tokens.pxd | 5 ++--- spacy/tokens.pyx | 39 ++++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index f91aa16ba..90356b74e 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -2,13 +2,12 @@ import numpy as np cimport numpy as np from cymem.cymem cimport Pool +from thinc.typedefs cimport 
atom_t from .lexeme cimport Lexeme from .typedefs cimport flags_t from .utf8string cimport StringStore -from thinc.typedefs cimport atom_t - cdef class Tokens: cdef Pool mem @@ -30,7 +29,7 @@ cdef class Tokens: cdef int push_back(self, int i, const Lexeme* lexeme) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 - cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) + cpdef np.ndarray[long, ndim=2] get_array(self, list features) cdef class Token: diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 7f79dcda9..7fdfa8e1e 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,7 +1,13 @@ # cython: profile=True +from preshed.maps cimport PreshMap +from preshed.counter cimport PreshCounter + from .lexeme cimport * cimport cython +import numpy as np +cimport numpy as np + POS = 0 ENTITY = 0 @@ -19,20 +25,10 @@ cdef class Tokens: """A sequence of references to Lexeme objects. The Tokens class provides fast and memory-efficient access to lexical features, - and can efficiently export the data to a numpy array. Specific languages - create their own Tokens subclasses, to provide more convenient access to - language-specific features. + and can efficiently export the data to a numpy array. >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') - >>> tokens.string(0) - 'An' - >>> tokens.prob(0) > tokens.prob(1) - True - >>> tokens.can_noun(0) - False - >>> tokens.can_noun(1) - True """ def __init__(self, StringStore string_store, string_length=0): self._string_store = string_store @@ -104,15 +100,28 @@ cdef class Tokens: elif tag_type == ENTITY: self.ner[i] = tag - cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features): + @cython.boundscheck(False) + cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): cdef int i, j - cdef np.ndarray[atom_t, ndim=2] output - output = np.ndarray(shape=(self.length, len(features)), dtype=int) + cdef attr_id_t feature + cdef np.ndarray[long, ndim=2] output + output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) for i in range(self.length): - for j, feature in enumerate(features): + for j, feature in enumerate(attr_ids): output[i, j] = get_attr(self.lex[i], feature) return output + def count_by(self, attr_id_t attr_id): + cdef int i + cdef attr_t attr + cdef size_t count + + cdef PreshCounter counts = PreshCounter(2 ** 8) + for i in range(self.length): + attr = get_attr(self.lex[i], attr_id) + counts.inc(attr, 1) + return dict(counts) + def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) From 564082e48e91efcf5f6dcb30b27b2252a7364ab2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 20:51:29 +1100 Subject: [PATCH 19/56] * Hack Token class to take lex.dense inplace of the old lex.norm. This needs to be fixed... 
--- spacy/tokens.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 7fdfa8e1e..c06a1b4d8 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -153,7 +153,7 @@ cdef class Token: self.postype = lex['pos_type'] self.sensetype = lex['sense_type'] self.sic = lex['sic'] - self.norm = lex['norm'] + self.norm = lex['dense'] self.shape = lex['shape'] self.suffix = lex['asciied'] self.prefix = lex['prefix'] From 49f3780ff5fc34343fe40ab0ad0f8e0b44b54ca9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 21:22:38 +1100 Subject: [PATCH 20/56] * Fiddle with lexeme attrs --- spacy/lexeme.pxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index ef0e8fb12..e35bde61e 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -103,8 +103,8 @@ cdef struct Lexeme: attr_t sense_type float prob - float upper_pc - float title_pc + float lower_pc + float sentiment cdef Lexeme EMPTY_LEXEME From a14f9eaf6355e28dcb4ba8180831e0470dcb2388 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 22:14:11 +1100 Subject: [PATCH 21/56] * Add index.pyx to setup --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index ae6d5a99d..35c411d38 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ exts = [ Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), + Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes) #Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), #Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), From 75b8dfb3484e89be88dd10fa9ab7ebdedf949efe Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 22:14:34 +1100 Subject: [PATCH 22/56] * Remove upper_pc from lexeme.pyx --- spacy/lexeme.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5c8d7a60e..cd92c4845 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -22,8 +22,7 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.sense_type = props.get('sense_type', 0) lex.prob = props.get('prob', 0) - lex.upper_pc = props.get('upper_pc', 0.0) - lex.title_pc = props.get('lower_pc', 0.0) + lex.lower_pc = props.get('lower_pc', 0.0) lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] From 187372c7f338e23b29994956c434da48c6af467c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 5 Dec 2014 03:29:50 +1100 Subject: [PATCH 23/56] * Allow the lexicon to create lexemes using an external memory pool, so that it can decide to make some lexemes temporary, rather than cached --- spacy/lang.pxd | 2 +- spacy/lang.pyx | 55 +++++++++++++++++++++++++++++--------------------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index d4b587a6b..d27378816 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -18,7 +18,7 @@ cdef class Lexicon: cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef const Lexeme* get(self, UniStr* s) except NULL + cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL cdef PreshMap _map 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 9013de9d0..100b51a98 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -18,6 +18,7 @@ from preshed.maps cimport PreshMap from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init +from .lexeme cimport check_flag, IS_ALPHA from .utf8string cimport slice_unicode @@ -53,7 +54,7 @@ cdef class Language: cdef int idx = 0 for i, py_string in enumerate(strings): slice_unicode(&string_struct, py_string, 0, len(py_string)) - tokens.push_back(idx, self.lexicon.get(&string_struct)) + tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -132,7 +133,7 @@ cdef class Language: # Check whether we've hit a special-case if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) break suf_len = self._find_suffix(string.chars, string.n) if suf_len != 0: @@ -141,18 +142,18 @@ cdef class Language: # Check whether we've hit a special-case if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= string.n: slice_unicode(string, string.chars, pre_len, string.n - suf_len) - prefixes.push_back(self.lexicon.get(&prefix)) - suffixes.push_back(self.lexicon.get(&suffix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) elif pre_len: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) elif suf_len: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) if self._specials.get(string.key): break return string @@ -175,22 +176,25 @@ cdef class Language: else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: - idx = tokens.push_back(idx, self.lexicon.get(string)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string)) else: slice_unicode(&span, string.chars, 0, split) - idx = tokens.push_back(idx, self.lexicon.get(&span)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) slice_unicode(&span, string.chars, split, split+1) - idx = tokens.push_back(idx, self.lexicon.get(&span)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) slice_unicode(&span, string.chars, split + 1, string.n) - idx = tokens.push_back(idx, self.lexicon.get(&span)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1: - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) cdef int i + for i in range(n): + if tokens[i].id == 1: + return 0 + lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) for i in range(n): lexemes[i] = tokens[i] lexemes[i + 1] = NULL @@ -230,7 +234,7 @@ cdef class Language: lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) for i, substring in enumerate(substrings): slice_unicode(&string, substring, 0, len(substring)) - lexemes[i] = self.lexicon.get(&string) + lexemes[i] = 
self.lexicon.get(self.lexicon.mem, &string) lexemes[i + 1] = NULL slice_unicode(&string, uni_string, 0, len(uni_string)) self._specials.set(string.key, lexemes) @@ -247,23 +251,28 @@ cdef class Lexicon: self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) - self.size = 1 + self.size = 2 self.set_flags = set_flags - cdef const Lexeme* get(self, UniStr* string) except NULL: + cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: '''Retrieve a pointer to a Lexeme from the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) if lex != NULL: return lex - lex = self.mem.alloc(sizeof(Lexeme), 1) + if string.n < 3: + mem = self.mem + lex = mem.alloc(sizeof(Lexeme), 1) lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {'flags': self.set_flags(string.chars[:string.n])}) - self._map.set(string.key, lex) - while self.lexemes.size() < (lex.id + 1): - self.lexemes.push_back(&EMPTY_LEXEME) - self.lexemes[lex.id] = lex - self.size += 1 + if mem is self.mem: + self._map.set(string.key, lex) + while self.lexemes.size() < (lex.id + 1): + self.lexemes.push_back(&EMPTY_LEXEME) + self.lexemes[lex.id] = lex + self.size += 1 + else: + lex[0].id = 1 return lex def __getitem__(self, id_or_string): @@ -290,7 +299,7 @@ cdef class Lexicon: return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) - cdef const Lexeme* lexeme = self.get(&string) + cdef const Lexeme* lexeme = self.get(self.mem, &string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): @@ -298,7 +307,7 @@ cdef class Lexicon: slice_unicode(&s, uni_string, 0, len(uni_string)) # Cast through the const here, since we're allowed to change our own # Lexemes. 
- lex = self.get(&s) + lex = self.get(self.mem, &s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): From 1c9253701daeeafac166608204c44a2db0e9e1fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 5 Dec 2014 15:56:14 +1100 Subject: [PATCH 24/56] * Introduce a TokenC struct, to handle token indices, pos tags and sense tags --- spacy/lang.pxd | 4 ++-- spacy/lang.pyx | 14 ++++++++------ spacy/tokens.pxd | 20 ++++++++++++-------- spacy/tokens.pyx | 46 ++++++++++++++++------------------------------ 4 files changed, 38 insertions(+), 46 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index d27378816..fd4cf6e70 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -6,7 +6,7 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .tokens cimport Tokens +from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .utf8string cimport StringStore, UniStr @@ -45,5 +45,5 @@ cdef class Language: cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 100b51a98..1fdd683f3 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -18,7 +18,7 @@ from preshed.maps cimport PreshMap from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init -from .lexeme cimport check_flag, IS_ALPHA +from .lexeme cimport check_flag from .utf8string cimport slice_unicode @@ -114,7 +114,7 @@ cdef class Language: orig_size = tokens.length self._split_affixes(span, &prefixes, &suffixes) self._attach_tokens(tokens, start, span, &prefixes, &suffixes) - self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) + self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size) cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, vector[const Lexeme*] *suffixes) except NULL: @@ -189,14 +189,14 @@ cdef class Language: idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1: + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1: cdef int i for i in range(n): - if tokens[i].id == 1: + if tokens[i].lex.id == 1: return 0 lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) for i in range(n): - lexemes[i] = tokens[i] + lexemes[i] = tokens[i].lex lexemes[i + 1] = NULL self._cache.set(key, lexemes) @@ -255,7 +255,9 @@ cdef class Lexicon: self.set_flags = set_flags cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: - '''Retrieve a pointer to a Lexeme from the lexicon.''' + '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme + if necessary, using memory acquired from the given pool. 
If the pool + is the lexicon's own memory, the lexeme is saved in the lexicon.''' cdef Lexeme* lex lex = self._map.get(string.key) if lex != NULL: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 90356b74e..a219c707f 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -9,18 +9,22 @@ from .typedefs cimport flags_t from .utf8string cimport StringStore +cdef struct TokenC: + const Lexeme* lex + int idx + int pos + int sense + + +cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0) + + cdef class Tokens: cdef Pool mem cdef StringStore _string_store - cdef const Lexeme** _lex_ptr - cdef int* _idx_ptr - cdef int* _pos_ptr - cdef int* _ner_ptr - cdef const Lexeme** lex - cdef int* idx - cdef int* pos - cdef int* ner + cdef TokenC* _data + cdef TokenC* data cdef int length cdef int max_length diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index c06a1b4d8..06d3eeb99 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -40,28 +40,18 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. - self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) - self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self.lex = self._lex_ptr - self.idx = self._idx_ptr - self.pos = self._pos_ptr - self.ner = self._ner_ptr + self._data = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - self.lex[i] = &EMPTY_LEXEME - self.lex += PADDING - self.idx += PADDING - self.pos += PADDING - self.ner += PADDING + self._data[i] = EMPTY_TOKEN + self.data = self._data + PADDING self.max_length = size self.length = 0 def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i], - self.lex[i][0]) + return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, + self.data[i].sense, self.data[i].lex[0]) def __iter__(self): for i in range(self.length): @@ -73,10 +63,11 @@ cdef class Tokens: cdef int push_back(self, int idx, const Lexeme* lexeme) except -1: if self.length == self.max_length: self._realloc(self.length * 2) - self.lex[self.length] = lexeme - self.idx[self.length] = idx - self.pos[self.length] = 0 - self.ner[self.length] = 0 + cdef TokenC* t = &self.data[self.length] + t.lex = lexeme + t.idx = idx + t.pos = 0 + t.sense = 0 self.length += 1 return idx + lexeme.length @@ -108,7 +99,7 @@ cdef class Tokens: output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) for i in range(self.length): for j, feature in enumerate(attr_ids): - output[i, j] = get_attr(self.lex[i], feature) + output[i, j] = get_attr(self.data[i].lex, feature) return output def count_by(self, attr_id_t attr_id): @@ -118,23 +109,18 @@ cdef class Tokens: cdef PreshCounter counts = PreshCounter(2 ** 8) for i in range(self.length): - attr = get_attr(self.lex[i], attr_id) + attr = get_attr(self.data[i].lex, attr_id) counts.inc(attr, 1) return dict(counts) def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) - self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) - self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) - self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int)) - self.lex = 
self._lex_ptr + PADDING - self.idx = self._idx_ptr + PADDING - self.pos = self._pos_ptr + PADDING - self.ner = self._ner_ptr + PADDING + self._data = self.mem.realloc(self._data, n * sizeof(TokenC)) + self.data = self._data + PADDING + cdef int i for i in range(self.length, self.max_length + PADDING): - self.lex[i] = &EMPTY_LEXEME + self.data[i] = EMPTY_TOKEN @cython.freelist(64) From e27b912ef98ac974181309b3d3be056ea4c9393b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 5 Dec 2014 16:31:30 +1100 Subject: [PATCH 25/56] * Remove need for confusing _data pointer to be stored on Tokens --- spacy/tokens.pxd | 1 - spacy/tokens.pyx | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index a219c707f..addb1e3e5 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -23,7 +23,6 @@ cdef class Tokens: cdef Pool mem cdef StringStore _string_store - cdef TokenC* _data cdef TokenC* data cdef int length diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 06d3eeb99..b474ff6fb 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -40,11 +40,11 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. - self._data = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) + data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - self._data[i] = EMPTY_TOKEN - self.data = self._data + PADDING + data_start[i] = EMPTY_TOKEN + self.data = data_start + PADDING self.max_length = size self.length = 0 @@ -116,8 +116,9 @@ cdef class Tokens: def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._data = self.mem.realloc(self._data, n * sizeof(TokenC)) - self.data = self._data + PADDING + cdef TokenC* data_start = self.data - PADDING + data_start = self.mem.realloc(data_start, n * sizeof(TokenC)) + self.data = data_start + PADDING cdef int i for i in range(self.length, self.max_length + PADDING): self.data[i] = EMPTY_TOKEN From f5c4f2eb52c3618c4dda056c0171b21b1b7a0e63 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:28:22 +1100 Subject: [PATCH 26/56] * Revise context, focussing on POS tagging for now --- spacy/context.pxd | 97 ++++++++++++++------------------ spacy/context.pyx | 138 +++++----------------------------------------- 2 files changed, 54 insertions(+), 181 deletions(-) diff --git a/spacy/context.pxd b/spacy/context.pxd index 8f798d347..3dd842b6e 100644 --- a/spacy/context.pxd +++ b/spacy/context.pxd @@ -1,66 +1,49 @@ from thinc.typedefs cimport atom_t -from .typedefs cimport hash_t -from .tokens cimport Tokens -from .lexeme cimport Lexeme +from .tokens cimport TokenC -cdef class Token: - cdef readonly atom_t sic - cdef readonly atom_t cluster - cdef readonly atom_t norm - cdef readonly atom_t shape - cdef readonly atom_t asciied - cdef readonly atom_t prefix - cdef readonly atom_t suffix - cdef readonly atom_t length +cpdef enum: + P2_sic + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_sense - cdef readonly atom_t postype - cdef readonly atom_t nertype - cdef readonly atom_t sensetype + P1_sic + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_sense - cdef readonly atom_t is_alpha - cdef readonly atom_t is_ascii - cdef readonly atom_t is_digit - cdef readonly atom_t is_lower - cdef readonly atom_t is_punct - cdef readonly atom_t is_space - cdef readonly atom_t is_title - cdef 
readonly atom_t is_upper - cdef readonly atom_t like_url - cdef readonly atom_t like_number - cdef readonly atom_t oft_lower - cdef readonly atom_t oft_title - cdef readonly atom_t oft_upper + W_sic + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_sense - cdef readonly atom_t in_males - cdef readonly atom_t in_females - cdef readonly atom_t in_surnames - cdef readonly atom_t in_places - cdef readonly atom_t in_games - cdef readonly atom_t in_celebs - cdef readonly atom_t in_names + N1_sic + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_sense - cdef readonly atom_t pos - cdef readonly atom_t sense - cdef readonly atom_t ner + N2_sic + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_sense + + N_FIELDS -cdef class Slots: - cdef readonly Token P4 - cdef readonly Token P3 - cdef readonly Token P2 - cdef readonly Token P1 - cdef readonly Token N0 - cdef readonly Token N1 - cdef readonly Token N2 - cdef readonly Token N3 - cdef readonly Token N4 - - -cdef int N_FIELDS - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1 - - -cpdef Slots FIELD_IDS +cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1 diff --git a/spacy/context.pyx b/spacy/context.pyx index aeb78ae5c..c81daef2c 100644 --- a/spacy/context.pyx +++ b/spacy/context.pyx @@ -1,126 +1,16 @@ -from murmurhash.mrmr cimport hash64 -from .lexeme cimport * +cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) -cdef class Slots: - def __init__(self): - self.P4 = Token() - self.P3 = Token() - self.P2 = Token() - self.P1 = Token() - self.N0 = Token() - self.N1 = Token() - self.N2 = Token() - self.N3 = Token() - self.N4 = Token() - - -cdef void _number_token(Token t, int* n_fields): - cdef int i = n_fields[0] - t.sic = i; i += 1 - t.cluster = i; i += 1 - t.norm = i; i += 1 - t.shape = i; i += 1 - t.prefix = i; i += 1 - t.suffix = i; i += 1 - t.length = i; i += 1 - - t.postype = i; i += 1 - t.nertype = i; i += 1 - t.sensetype = i; i += 1 - - t.is_alpha = i; i += 1 - t.is_ascii = i; i += 1 - t.is_digit = i; i += 1 - t.is_lower = i; i += 1 - t.is_punct = i; i += 1 - t.is_space = i; i += 1 - t.is_title = i; i += 1 - t.is_upper = i; i += 1 - - t.like_number = i; i += 1 - t.like_url = i; i += 1 - - t.oft_lower = i; i += 1 - t.oft_title = i; i += 1 - t.oft_upper = i; i += 1 - - t.in_males = i; i += 1 - t.in_females = i; i += 1 - t.in_surnames = i; i += 1 - t.in_places = i; i += 1 - t.in_games = i; i += 1 - t.in_celebs = i; i += 1 - t.in_names = i; i += 1 - - t.pos = i; i += 1 - t.sense = i; i += 1 - t.ner = i; i += 1 - - n_fields[0] = i - - -cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner): - c[t.sic] = lex.sic - c[t.cluster] = lex.cluster - c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - c[t.shape] = lex.shape - c[t.asciied] = lex.asciied - c[t.prefix] = lex.prefix - c[t.suffix] = lex.suffix - c[t.length] = lex.length - - c[t.postype] = lex.postype - c[t.nertype] = 0 - c[t.sensetype] = 0 - - c[t.is_alpha] = lex.flags & (1 << IS_ALPHA) - c[t.is_digit] = lex.flags & (1 << IS_DIGIT) - c[t.is_lower] = lex.flags & (1 << IS_LOWER) - c[t.is_punct] = lex.flags & (1 << IS_PUNCT) - c[t.is_space] = lex.flags & (1 << IS_SPACE) - 
c[t.is_title] = lex.flags & (1 << IS_TITLE) - c[t.is_upper] = lex.flags & (1 << IS_UPPER) - c[t.like_url] = lex.flags & (1 << LIKE_URL) - c[t.like_number] = lex.flags & (1 << LIKE_NUMBER) - c[t.oft_lower] = lex.flags & (1 << OFT_LOWER) - c[t.oft_title] = lex.flags & (1 << OFT_TITLE) - c[t.oft_upper] = lex.flags & (1 << OFT_UPPER) - - c[t.in_males] = lex.flags & (1 << IN_MALES) - c[t.in_females] = lex.flags & (1 << IN_FEMALES) - c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES) - c[t.in_places] = lex.flags & (1 << IN_PLACES) - c[t.in_games] = lex.flags & (1 << IN_GAMES) - c[t.in_celebs] = lex.flags & (1 << IN_CELEBS) - c[t.in_names] = lex.flags & (1 << IN_NAMES) - - c[t.pos] = pos - c[t.sense] = 0 - c[t.ner] = ner - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1: - _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4]) - _fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3]) - _fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2]) - _fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1]) - _fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i]) - _fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1]) - _fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2]) - _fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3]) - _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4]) - return 1 - - -N_FIELDS = 0 -FIELD_IDS = Slots() -_number_token(FIELD_IDS.P4, &N_FIELDS) -_number_token(FIELD_IDS.P3, &N_FIELDS) -_number_token(FIELD_IDS.P2, &N_FIELDS) -_number_token(FIELD_IDS.P1, &N_FIELDS) -_number_token(FIELD_IDS.N0, &N_FIELDS) -_number_token(FIELD_IDS.N1, &N_FIELDS) -_number_token(FIELD_IDS.N2, &N_FIELDS) -_number_token(FIELD_IDS.N3, &N_FIELDS) -_number_token(FIELD_IDS.N4, &N_FIELDS) +cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense From 0c7aeb9de7105e513d38544c15c860764a521bcb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:29:04 +1100 Subject: [PATCH 27/56] * Begin revising tagger, focussing on POS tagging --- spacy/tagger.pxd | 11 +----- spacy/tagger.pyx | 100 +++++++++++++++++++++++------------------------ 2 files changed, 50 insertions(+), 61 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 11d8d2a4c..0a9b4a0c4 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -5,20 +5,17 @@ from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from .typedefs cimport hash_t -from .context cimport Slots from .tokens cimport Tokens cpdef enum TagType: POS - ENTITY SENSE cdef class Tagger: cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens) except 0 - cpdef int tell_answer(self, list gold) except -1 + cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except 0 cpdef readonly Pool mem cpdef readonly Extractor extractor @@ -26,9 +23,3 @@ cdef class Tagger: cpdef readonly TagType tag_type cpdef readonly list tag_names - - cdef class_t _guess - cdef atom_t* _context - cdef feat_t* _feats - cdef weight_t* _values - cdef weight_t* _scores diff --git 
a/spacy/tagger.pyx b/spacy/tagger.pyx index 428814f70..22732843d 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,8 +1,10 @@ # cython: profile=True -from __future__ import print_function from __future__ import unicode_literals from __future__ import division +from .context cimport fill_context +from .context cimport N_FIELDS + from os import path import os import shutil @@ -10,11 +12,7 @@ import random import json import cython - -from .context cimport fill_context -from .context cimport N_FIELDS - -from thinc.features cimport ConjFeat +from thinc.features cimport Feature, count_feats NULL_TAG = 0 @@ -35,7 +33,8 @@ def setup_model_dir(tag_type, tag_names, templates, model_dir): def train(train_sents, model_dir, nr_iter=10): cdef Tokens tokens - tagger = Tagger(model_dir) + cdef Tagger tagger = Tagger(model_dir) + cdef int i for _ in range(nr_iter): n_corr = 0 total = 0 @@ -43,9 +42,10 @@ def train(train_sents, model_dir, nr_iter=10): assert len(tokens) == len(golds), [t.string for t in tokens] for i in range(tokens.length): if tagger.tag_type == POS: - gold = _get_gold_pos(i, golds, tokens.pos) - elif tagger.tag_type == ENTITY: - gold = _get_gold_ner(i, golds, tokens.ner) + gold = _get_gold_pos(i, golds) + else: + raise StandardError + guess = tagger.predict(i, tokens) tokens.set_tag(i, tagger.tag_type, guess) if gold is not None: @@ -59,7 +59,7 @@ def train(train_sents, model_dir, nr_iter=10): tagger.model.dump(path.join(model_dir, 'model')) -cdef object _get_gold_pos(i, golds, int* pred): +cdef object _get_gold_pos(i, golds): if golds[i] == 0: return None else: @@ -96,17 +96,11 @@ cdef class Tagger: templates = cfg['templates'] self.tag_names = cfg['tag_names'] self.tag_type = cfg['tag_type'] - self.extractor = Extractor(templates, [ConjFeat] * len(templates)) + self.extractor = Extractor(templates) self.model = LinearModel(len(self.tag_names)) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - self._context = self.mem.alloc(N_FIELDS, sizeof(atom_t)) - self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) - self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) - self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) - self._guess = NULL_TAG - cpdef int set_tags(self, Tokens tokens) except -1: """Assign tags to a Tokens object. @@ -119,7 +113,7 @@ cdef class Tagger: for i in range(tokens.length): tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - cpdef class_t predict(self, int i, Tokens tokens) except 0: + cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except 0: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. @@ -127,38 +121,20 @@ cdef class Tagger: >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - fill_context(self._context, i, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self._guess = self.model.score(self._scores, self._feats, self._values) - return self._guess - - cpdef int tell_answer(self, list golds) except -1: - """Provide the correct tag for the word the tagger was last asked to predict. - During Tagger.predict, the tagger remembers the features and prediction - for the example. These are used to calculate a weight update given the - correct label. 
- - >>> tokens = EN.tokenize('An example sentence.') - >>> guess = EN.pos_tagger.predict(1, tokens) - >>> JJ = EN.pos_tagger.tag_id('JJ') - >>> JJ - 7 - >>> EN.pos_tagger.tell_answer(JJ) - """ - cdef class_t guess = self._guess - if guess in golds: - self.model.update({}) - return 0 - best_gold = golds[0] - best_score = self._scores[best_gold-1] - for gold in golds[1:]: - if self._scores[gold-1] > best_gold: - best_score = self._scores[best_gold-1] - best_gold = gold - counts = {guess: {}, best_gold: {}} - self.extractor.count(counts[best_gold], self._feats, 1) - self.extractor.count(counts[guess], self._feats, -1) - self.model.update(counts) + cdef int n_feats + cdef atom_t[N_FIELDS] context + print sizeof(context) + fill_context(context, i, tokens.data) + cdef Feature* feats = self.extractor.get_feats(context, &n_feats) + cdef weight_t* scores = self.model.get_scores(feats, n_feats) + cdef class_t guess = _arg_max(scores, self.nr_class) + if golds is not None and guess not in golds: + best = _arg_max_among(scores, golds) + counts = {} + count_feats(counts[guess], feats, n_feats, -1) + count_feats(counts[best], feats, n_feats, 1) + self.model.update(counts) + return guess def tag_id(self, object tag_name): """Encode tag_name into a tag ID integer.""" @@ -167,3 +143,25 @@ cdef class Tagger: tag_id = len(self.tag_names) self.tag_names.append(tag_name) return tag_id + + +cdef class_t _arg_max(weight_t* scores, int n_classes): + cdef int best = 0 + cdef weight_t score = scores[best] + cdef int i + for i in range(1, n_classes): + if scores[i] > score: + score = scores[i] + best = i + return best + + +cdef class_t _arg_max_among(weight_t* scores, list classes): + cdef int best = classes[0] + cdef weight_t score = scores[best] + cdef class_t clas + for clas in classes: + if scores[clas] > score: + score = scores[clas] + best = clas + return best From 5caabec789a767a6d9876fc4b79f3c302afceb0a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:29:41 +1100 Subject: [PATCH 28/56] * Link in tagger, to work on integrating POS tagging --- spacy/lang.pxd | 2 ++ spacy/lang.pyx | 2 ++ 2 files changed, 4 insertions(+) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index fd4cf6e70..54f317ce8 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -8,6 +8,7 @@ from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme +from .tagger cimport Tagger from .utf8string cimport StringStore, UniStr @@ -29,6 +30,7 @@ cdef class Language: cdef PreshMap _cache cdef PreshMap _specials cpdef readonly Lexicon lexicon + cpdef readonly Tagger pos_tagger cdef object _prefix_re cdef object _suffix_re diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 1fdd683f3..0ca5f08d2 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -39,10 +39,12 @@ cdef class Language: self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.set_flags) self._load_special_tokenization(rules) + self.pos_tagger = None def load(self): self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) + self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) From 91e8d9ea1c89da2cc2e339213771d6622c8ad3fc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 15:29:54 +1100 Subject: [PATCH 29/56] * Compile context.pyx and tagger.pyx modules --- setup.py | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 35c411d38..1199ba613 100644 --- a/setup.py +++ b/setup.py @@ -53,9 +53,9 @@ exts = [ Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), - Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes) - #Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - #Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), + Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), + Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), From 5fe5e6e66b3d3770214264671bf2c46315abd1c1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 21:59:04 +1100 Subject: [PATCH 30/56] * Move context functions to header, inlining them. --- spacy/context.pxd | 17 ++++++++++++++++- spacy/context.pyx | 15 --------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/context.pxd b/spacy/context.pxd index 3dd842b6e..3c7764846 100644 --- a/spacy/context.pxd +++ b/spacy/context.pxd @@ -46,4 +46,19 @@ cpdef enum: N_FIELDS -cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1 +cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense diff --git a/spacy/context.pyx b/spacy/context.pyx index c81daef2c..8b1378917 100644 --- a/spacy/context.pyx +++ b/spacy/context.pyx @@ -1,16 +1 @@ -cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - -cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.sense From 677e111ee7c033cb4cb9a2b1cc41dd282c3a74a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:04:47 +1100 Subject: [PATCH 31/56] * Revise tokenization rules to match PTB. Rules are pretty messy around periods, need better support for these. 
--- data/en/prefix | 5 ++ data/en/tokenization | 182 ++++++++++++++++++++++++------------------- 2 files changed, 109 insertions(+), 78 deletions(-) diff --git a/data/en/prefix b/data/en/prefix index 64a3f1f2f..cb9bb4d7b 100644 --- a/data/en/prefix +++ b/data/en/prefix @@ -11,3 +11,8 @@ $ ' `` ` +# +US$ +C$ +A$ +a- diff --git a/data/en/tokenization b/data/en/tokenization index 6bf0d738b..e2b78dd28 100644 --- a/data/en/tokenization +++ b/data/en/tokenization @@ -6,99 +6,100 @@ 's 's 'S 'S -ain't are not -aren't are not -can't can not +ain't ai n't +aren't are n't +can't ca n't cannot can not -could've could have -couldn't could not -couldn't've could not have -didn't did not -doesn't does not -don't do not -hadn't had not -hadn't've had not have -hasn't has not -haven't have not -he'd he would -he'd've he would have -he'll he will +could've could 've +couldn't could n't +couldn't've could n't 've +didn't did n't +doesn't does n't +don't do n't +hadn't had n't +hadn't've had n't 've +hasn't has n't +haven't have n't +he'd he 'd +he'd've he 'd 've +he'll he 'll he's he 's -how'd he would -how'll he will +how'd he 'd +how'll he 'll how's how 's -I'd I would -I'd've I would have -I'll I will -I'm I am -I'ma I will -I've I have -isn't is not -it'd it would -it'd've it would have -it'll it will +I'd I 'd +I'd've I 'd 've +I'll I 'll +I'm I 'm +I'ma I 'ma +I've I 've +isn't is n't +it'd it 'd +it'd've it 'd 've +it'll it 'll it's it 's let's let 's -mightn't might not -mightn't've might not have -might've might have -mustn't must not -must've must have -needn't need not -not've not have -shan't shall not -she'd she would -she'd've she would have +mightn't might n't +mightn't've might n't 've +might've might 've +mustn't must n't +must've must 've +needn't need n't +not've not 've +shan't sha n't +she'd she 'd +she'd've she 'd 've she'll she will she's she 's -should've should have -shouldn't should not -shouldn't've should not have +should've should 've +shouldn't should n't +shouldn't've should n't 've that's that 's -there'd there would -there'd've there would have -there's there is -they'd there would -they'd've they would have -they'll they will -they're they are -they've they have -wasn't was not -we'd we would -we'd've we would have -we'll we will -we're we are -we've we have -weren't were not -what'll what will -what're what are +there'd there 'd +there'd've there 'd 've +there's there 's +they'd there 'd +they'd've they 'd 've +they'll they 'll +they're they 're +they've they 've +wasn't was n't +we'd we 'd +we'd've we 'd 've +we'll we 'll +we're we 're +we've we 've +weren't were n't +what'll what 'll +what're what 're what's what 's -what've what have +what've what 've when's when 's -where'd where would +where'd where 'd where's where 's -where've where have -who'd who would -who'll who will -who're who are +where've where 've +who'd who 'd +who'll who 'll +who're who 're who's who 's -who've who have -why'll who will -why're why are +who've who 've +why'll why 'll +why're why 're why's why 's -won't will not -would've would have -wouldn't would not -wouldn't've would not have -you'd you would -you'd've you would have -you'll you will -you're you are -you've you have -'em them -'ol old +won't wo n't +would've would 've +wouldn't would n't +wouldn't've would n't 've +you'd you 'd +you'd've you 'd 've +you'll you 'll +you're you 're +you've you 've +'em 'em +'ol 'ol 10km 10 km U.S. U.S. +U.K. U.K. non-U.S. non-U.S. U.N. U.N. Co. Co. @@ -115,7 +116,12 @@ A.G. A.G. Rep. Rep. Ms. Ms. Mr. Mr. 
+Mrs. Mrs. a.m. a.m. +Sen. Sen. +INC. INC. +CO. CO. +COS. COS. p.m. p.m. Nos. Nos. a.k.a. a.k.a. @@ -127,6 +133,7 @@ E. E. F. F. G. G. H. H. +I. I. J. J. K. K. L. L. @@ -205,6 +212,9 @@ Wash. Wash. W.Va. W.Va. Wis. Wis. Wyo. Wyo. +L.A. L.A. +R.H. R.H. +Gov. Gov. '' '' :) :) <3 <3 @@ -262,3 +272,19 @@ V_V V_V o.O o.O ") ") .... .... +a- a - +Messrs. Messrs. +No. No. +vs. vs. +Gen. Gen. +Cos. Cos. +L.J. L.J. +D.T. D.T. +Prof. Prof. +Bros. Bros. +J.C. J.C. +Neb. Neb. +Adm. Adm. +U.S.S.R. U.S.S.R. +Rev. Rev. +H.F. H.F. From f00afe12c4af10d49f64d3bbc32734b94eb09df1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:05:57 +1100 Subject: [PATCH 32/56] * Load POS tagger in load() function if path exists --- spacy/lang.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 0ca5f08d2..d0f5e6944 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -44,7 +44,8 @@ cdef class Language: def load(self): self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) - self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) + if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): + self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) From 3819a88e1b313d868c60ab4815fca1c4e5eefd76 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:07:16 +1100 Subject: [PATCH 33/56] * Add support for tag dictionary, and fix error-code for predict method --- spacy/tagger.pxd | 4 +++- spacy/tagger.pyx | 55 ++++++++++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 0a9b4a0c4..772086926 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -3,6 +3,7 @@ from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from preshed.maps cimport PreshMap from .typedefs cimport hash_t from .tokens cimport Tokens @@ -15,7 +16,7 @@ cpdef enum TagType: cdef class Tagger: cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except 0 + cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor @@ -23,3 +24,4 @@ cdef class Tagger: cpdef readonly TagType tag_type cpdef readonly list tag_names + cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 22732843d..0ae66a844 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -18,7 +18,7 @@ from thinc.features cimport Feature, count_feats NULL_TAG = 0 -def setup_model_dir(tag_type, tag_names, templates, model_dir): +def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) @@ -26,6 +26,7 @@ def setup_model_dir(tag_type, tag_names, templates, model_dir): 'tag_type': tag_type, 'templates': templates, 'tag_names': tag_names, + 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: json.dump(config, file_) @@ -35,24 +36,19 @@ def train(train_sents, model_dir, nr_iter=10): cdef Tokens tokens cdef Tagger tagger = Tagger(model_dir) cdef int i + cdef class_t guess = 0 + cdef class_t gold for _ in range(nr_iter): n_corr = 0 
total = 0 for tokens, golds in train_sents: assert len(tokens) == len(golds), [t.string for t in tokens] for i in range(tokens.length): - if tagger.tag_type == POS: - gold = _get_gold_pos(i, golds) - else: - raise StandardError - - guess = tagger.predict(i, tokens) + gold = golds[i] + guess = tagger.predict(i, tokens, [gold]) tokens.set_tag(i, tagger.tag_type, guess) - if gold is not None: - tagger.tell_answer(gold) - total += 1 - n_corr += guess in gold - #print('%s\t%d\t%d' % (tokens[i].string, guess, gold)) + total += 1 + n_corr += guess == gold print('%.4f' % ((n_corr / total) * 100)) random.shuffle(train_sents) tagger.model.end_training() @@ -96,8 +92,9 @@ cdef class Tagger: templates = cfg['templates'] self.tag_names = cfg['tag_names'] self.tag_type = cfg['tag_type'] + self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) - self.model = LinearModel(len(self.tag_names)) + self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) @@ -113,7 +110,7 @@ cdef class Tagger: for i in range(tokens.length): tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except 0: + cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. @@ -121,16 +118,18 @@ cdef class Tagger: >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - cdef int n_feats + cdef atom_t sic = tokens.data[i].lex.sic + if sic in self.tagdict: + return self.tagdict[sic] cdef atom_t[N_FIELDS] context - print sizeof(context) fill_context(context, i, tokens.data) + cdef int n_feats cdef Feature* feats = self.extractor.get_feats(context, &n_feats) cdef weight_t* scores = self.model.get_scores(feats, n_feats) - cdef class_t guess = _arg_max(scores, self.nr_class) + guess = _arg_max(scores, self.model.nr_class) if golds is not None and guess not in golds: best = _arg_max_among(scores, golds) - counts = {} + counts = {guess: {}, best: {}} count_feats(counts[guess], feats, n_feats, -1) count_feats(counts[best], feats, n_feats, 1) self.model.update(counts) @@ -145,12 +144,28 @@ cdef class Tagger: return tag_id -cdef class_t _arg_max(weight_t* scores, int n_classes): +def _make_tag_dict(counts): + freq_thresh = 50 + ambiguity_thresh = 0.98 + tagdict = {} + cdef atom_t word + cdef atom_t tag + for word_str, tag_freqs in counts.items(): + tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1]) + n = sum(tag_freqs.values()) + word = int(word_str) + tag = int(tag_str) + if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: + tagdict[word] = tag + return tagdict + + +cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000: cdef int best = 0 cdef weight_t score = scores[best] cdef int i for i in range(1, n_classes): - if scores[i] > score: + if scores[i] >= score: score = scores[i] best = i return best From 9f17467c2e1a1d14d775e3da90aa1d668cbe7d15 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:07:41 +1100 Subject: [PATCH 34/56] * Fix EMPTY_TOKEN --- spacy/tokens.pxd | 3 --- spacy/tokens.pyx | 14 ++++++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index addb1e3e5..e6bc0a46a 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -16,9 +16,6 
@@ cdef struct TokenC: int sense -cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0) - - cdef class Tokens: cdef Pool mem cdef StringStore _string_store diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index b474ff6fb..407ffcb8b 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -43,7 +43,7 @@ cdef class Tokens: data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - data_start[i] = EMPTY_TOKEN + data_start[i].lex = &EMPTY_LEXEME self.data = data_start + PADDING self.max_length = size self.length = 0 @@ -86,10 +86,7 @@ cdef class Tokens: return idx cpdef int set_tag(self, int i, int tag_type, int tag) except -1: - if tag_type == POS: - self.pos[i] = tag - elif tag_type == ENTITY: - self.ner[i] = tag + self.data[i].pos = tag @cython.boundscheck(False) cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): @@ -116,12 +113,17 @@ cdef class Tokens: def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) + # What we're storing is a "padded" array. We've jumped forward PADDING + # places, and are storing the pointer to that. This way, we can access + # words out-of-bounds, and get out-of-bounds markers. + # Now that we want to realloc, we need the address of the true start, + # so we jump the pointer back PADDING places. cdef TokenC* data_start = self.data - PADDING data_start = self.mem.realloc(data_start, n * sizeof(TokenC)) self.data = data_start + PADDING cdef int i for i in range(self.length, self.max_length + PADDING): - self.data[i] = EMPTY_TOKEN + self.data[i].lex = &EMPTY_LEXEME @cython.freelist(64) From 8f2f319c57799f5bd9cad34c6dd4ee29c56e2713 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:08:04 +1100 Subject: [PATCH 35/56] * Add a couple more contractions tests --- tests/test_contractions.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_contractions.py b/tests/test_contractions.py index b7347a617..8334a74a9 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -39,3 +39,10 @@ def test_capitalized(): tokens = EN.tokenize("Ain't") assert len(tokens) == 2 assert tokens[0].string == "Are" + + +def test_punct(): + tokens = EN.tokenize("We've") + assert len(tokens) == 2 + tokens = EN.tokenize("``We've") + assert len(tokens) == 3 From 327383e38afb810f6afc3cd185444225f0368074 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 22:14:51 +1100 Subject: [PATCH 36/56] * Remove unused code in tagger.pyx --- spacy/tagger.pyx | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 0ae66a844..04ffef550 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -15,9 +15,6 @@ import cython from thinc.features cimport Feature, count_feats -NULL_TAG = 0 - - def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) @@ -55,33 +52,6 @@ def train(train_sents, model_dir, nr_iter=10): tagger.model.dump(path.join(model_dir, 'model')) -cdef object _get_gold_pos(i, golds): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -cdef object _get_gold_ner(i, golds, int* ner): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -def evaluate(tagger, sents): - n_corr = 0 - total = 0 - for tokens, golds in sents: - for i, gold in enumerate(golds): - guess = tagger.predict(i, tokens) - tokens.set_tag(i, tagger.tag_type, guess) - if gold != NULL_TAG: - total += 1 - n_corr 
+= guess == gold - return n_corr / total - - cdef class Tagger: """Assign part-of-speech, named entity or supersense tags, using greedy decoding. The tagger reads its model and configuration from disk. From ef4398b204ab60e4ff3c9bf476a059ad07be282d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 23:52:41 +1100 Subject: [PATCH 37/56] * Rearrange POS stuff, so that language-specific stuff can live in language-specific modules --- setup.py | 1 - spacy/context.pxd | 63 ------------------------------------------ spacy/en.pxd | 70 +++++++++++++++++++++++++++++++++++++++++++++-- spacy/en.pyx | 51 ++++++++++++++++++++++++++++++---- spacy/lang.pxd | 2 +- spacy/lang.pyx | 11 ++++---- spacy/lexeme.pxd | 13 --------- spacy/lexeme.pyx | 7 +---- spacy/tagger.pxd | 10 +------ spacy/tagger.pyx | 49 ++------------------------------- spacy/tokens.pyx | 4 +-- 11 files changed, 127 insertions(+), 154 deletions(-) diff --git a/setup.py b/setup.py index 1199ba613..6ff1f5d62 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,6 @@ exts = [ Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), diff --git a/spacy/context.pxd b/spacy/context.pxd index 3c7764846..8b1378917 100644 --- a/spacy/context.pxd +++ b/spacy/context.pxd @@ -1,64 +1 @@ -from thinc.typedefs cimport atom_t -from .tokens cimport TokenC - -cpdef enum: - P2_sic - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_sense - - P1_sic - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_sense - - W_sic - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_sense - - N1_sic - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_sense - - N2_sic - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_sense - - N_FIELDS - - -cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.sense diff --git a/spacy/en.pxd b/spacy/en.pxd index cccfb60a8..8ce023106 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,5 +1,9 @@ -from spacy.lang cimport Language -from spacy.tokens cimport Tokens +from thinc.typedefs cimport atom_t + +from .lang cimport Language +from .tokens cimport Tokens +from .tokens cimport TokenC + # Flags cpdef enum FlagID: @@ -28,5 +32,67 @@ cpdef enum FlagID: IN_NAMES +cpdef enum: + P2_sic + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_sense + + P1_sic + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_sense + + W_sic + W_cluster + W_shape + 
W_prefix + W_suffix + W_pos + W_sense + + N1_sic + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_sense + + N2_sic + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_sense + + N_CONTEXT_FIELDS + + +cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense + + cdef class English(Language): pass diff --git a/spacy/en.pyx b/spacy/en.pyx index 92be97aad..c0eb0368b 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -30,11 +30,6 @@ same scheme. Tokenization problems are a major cause of poor performance for NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module provides a fully Penn Treebank 3-compliant tokenizer. ''' -# TODO -#The script translate_treebank_tokenization can be used to transform a treebank's -#annotation to use one of the spacy tokenization schemes. - - from __future__ import unicode_literals cimport lang @@ -42,6 +37,32 @@ from .typedefs cimport flags_t import orth +POS_TEMPLATES = ( + (W_sic,), + (P1_sic,), + (N1_sic,), + (N2_sic,), + (P2_sic,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_sic), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), +) + + cdef class English(Language): """English tokenizer, tightly coupled to lexicon. @@ -49,6 +70,9 @@ cdef class English(Language): name (unicode): The two letter code used by Wikipedia for the language. lexicon (Lexicon): The lexicon. Exposes the lookup method. 
""" + def get_props(self, unicode string): + return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)} + def set_flags(self, unicode string): cdef flags_t flags = 0 flags |= orth.is_alpha(string) << IS_ALPHA @@ -64,5 +88,22 @@ cdef class English(Language): flags |= orth.like_number(string) << LIKE_NUMBER return flags + def set_pos(self, Tokens tokens): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + for i in range(tokens.length): + fill_pos_context(context, i, tokens.data) + tokens.data[i].pos = self.pos_tagger.predict(context) + + def train_pos(self, Tokens tokens, golds): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + c = 0 + for i in range(tokens.length): + fill_pos_context(context, i, tokens.data) + tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]]) + c += tokens.data[i].pos == golds[i] + return c + EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 54f317ce8..20374f40d 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -13,7 +13,7 @@ from .utf8string cimport StringStore, UniStr cdef class Lexicon: - cpdef public set_flags + cpdef public get_lex_props cdef Pool mem cpdef readonly size_t size cpdef readonly StringStore strings diff --git a/spacy/lang.pyx b/spacy/lang.pyx index d0f5e6944..496c6742c 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -37,7 +37,7 @@ cdef class Language: self._prefix_re = re.compile(prefix) self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) - self.lexicon = Lexicon(self.set_flags) + self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) self.pos_tagger = None @@ -249,13 +249,13 @@ cdef class Lexicon: Also interns UTF-8 strings, and maps them to consecutive integer IDs. ''' - def __init__(self, object set_flags=None): + def __init__(self, object get_props): self.mem = Pool() self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) self.size = 2 - self.set_flags = set_flags + self.get_lex_props = get_props cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme @@ -267,9 +267,10 @@ cdef class Lexicon: return lex if string.n < 3: mem = self.mem + cdef unicode py_string = string.chars[:string.n] lex = mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, - self.strings, {'flags': self.set_flags(string.chars[:string.n])}) + lex[0] = lexeme_init(self.size, py_string, string.key, self.strings, + self.get_lex_props(py_string)) if mem is self.mem: self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index e35bde61e..f524188ed 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -72,17 +72,14 @@ cpdef enum attr_id_t: ID SIC - STEM DENSE SHAPE - ASCIIED PREFIX SUFFIX LENGTH CLUSTER POS_TYPE - SENSE_TYPE cdef struct Lexeme: @@ -90,20 +87,16 @@ cdef struct Lexeme: attr_t id attr_t sic - attr_t stem attr_t dense attr_t shape - attr_t asciied attr_t prefix attr_t suffix attr_t length attr_t cluster attr_t pos_type - attr_t sense_type float prob - float lower_pc float sentiment @@ -127,12 +120,8 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: return lex.sic elif feat_name == DENSE: return lex.dense - elif feat_name == STEM: - return lex.stem elif feat_name == SHAPE: return lex.shape - elif feat_name == ASCIIED: - return lex.asciied elif feat_name == PREFIX: return lex.prefix elif feat_name == SUFFIX: 
@@ -143,7 +132,5 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: return lex.cluster elif feat_name == POS_TYPE: return lex.pos_type - elif feat_name == SENSE_TYPE: - return lex.sense_type else: return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index cd92c4845..f1974cbc9 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -19,17 +19,12 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.cluster = props.get('cluster', 0) lex.pos_type = props.get('pos_type', 0) - lex.sense_type = props.get('sense_type', 0) lex.prob = props.get('prob', 0) - lex.lower_pc = props.get('lower_pc', 0.0) - lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] lex.shape = string_store[orth.word_shape(string)] - lex.dense = lex.sic if lex.prob >= -10 else lex.shape - lex.stem = string_store[props.get('stem', string)] - lex.asciied = string_store[orth.asciied(string)] + lex.dense = string_store[props['dense']] lex.flags = props.get('flags', 0) return lex diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 772086926..39ba7ed41 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -3,25 +3,17 @@ from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t -from preshed.maps cimport PreshMap from .typedefs cimport hash_t from .tokens cimport Tokens -cpdef enum TagType: - POS - SENSE - - cdef class Tagger: - cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except * + cdef class_t predict(self, atom_t* context, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor cpdef readonly LinearModel model - cpdef readonly TagType tag_type cpdef readonly list tag_names cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 04ffef550..e0cd0bf3b 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -2,9 +2,6 @@ from __future__ import unicode_literals from __future__ import division -from .context cimport fill_context -from .context cimport N_FIELDS - from os import path import os import shutil @@ -15,12 +12,11 @@ import cython from thinc.features cimport Feature, count_feats -def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): +def setup_model_dir(tag_names, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { - 'tag_type': tag_type, 'templates': templates, 'tag_names': tag_names, 'tag_counts': tag_counts, @@ -29,29 +25,6 @@ def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir): json.dump(config, file_) -def train(train_sents, model_dir, nr_iter=10): - cdef Tokens tokens - cdef Tagger tagger = Tagger(model_dir) - cdef int i - cdef class_t guess = 0 - cdef class_t gold - for _ in range(nr_iter): - n_corr = 0 - total = 0 - for tokens, golds in train_sents: - assert len(tokens) == len(golds), [t.string for t in tokens] - for i in range(tokens.length): - gold = golds[i] - guess = tagger.predict(i, tokens, [gold]) - tokens.set_tag(i, tagger.tag_type, guess) - total += 1 - n_corr += guess == gold - print('%.4f' % ((n_corr / total) * 100)) - random.shuffle(train_sents) - tagger.model.end_training() - tagger.model.dump(path.join(model_dir, 'model')) - - cdef class Tagger: """Assign part-of-speech, named entity or supersense tags, using greedy decoding. The tagger reads its model and configuration from disk. 
@@ -61,26 +34,13 @@ cdef class Tagger: cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] self.tag_names = cfg['tag_names'] - self.tag_type = cfg['tag_type'] self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - cpdef int set_tags(self, Tokens tokens) except -1: - """Assign tags to a Tokens object. - - >>> tokens = EN.tokenize(u'An example sentence.') - >>> assert tokens[0].pos == 'NO_TAG' - >>> EN.pos_tagger.set_tags(tokens) - >>> assert tokens[0].pos == 'DT' - """ - cdef int i - for i in range(tokens.length): - tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - - cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *: + cdef class_t predict(self, atom_t* context, object golds=None) except *: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. @@ -88,11 +48,6 @@ cdef class Tagger: >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - cdef atom_t sic = tokens.data[i].lex.sic - if sic in self.tagdict: - return self.tagdict[sic] - cdef atom_t[N_FIELDS] context - fill_context(context, i, tokens.data) cdef int n_feats cdef Feature* feats = self.extractor.get_feats(context, &n_feats) cdef weight_t* scores = self.model.get_scores(feats, n_feats) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 407ffcb8b..33f265eef 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -140,11 +140,11 @@ cdef class Token: self.cluster = lex['cluster'] self.length = lex['length'] self.postype = lex['pos_type'] - self.sensetype = lex['sense_type'] + self.sensetype = 0 self.sic = lex['sic'] self.norm = lex['dense'] self.shape = lex['shape'] - self.suffix = lex['asciied'] + self.suffix = lex['suffix'] self.prefix = lex['prefix'] self.prob = lex['prob'] From b031c7c4306ed348dc371ea28f3e4c2759845fcb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 7 Dec 2014 23:53:01 +1100 Subject: [PATCH 38/56] * Remove language-general context module --- spacy/context.pxd | 1 - spacy/context.pyx | 1 - 2 files changed, 2 deletions(-) delete mode 100644 spacy/context.pxd delete mode 100644 spacy/context.pyx diff --git a/spacy/context.pxd b/spacy/context.pxd deleted file mode 100644 index 8b1378917..000000000 --- a/spacy/context.pxd +++ /dev/null @@ -1 +0,0 @@ - diff --git a/spacy/context.pyx b/spacy/context.pyx deleted file mode 100644 index 8b1378917..000000000 --- a/spacy/context.pyx +++ /dev/null @@ -1 +0,0 @@ - From c20dd79748e928384722df113473d207b26a7893 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 00:03:55 +1100 Subject: [PATCH 39/56] * Fiddle with const correctness and comments --- spacy/tagger.pxd | 2 +- spacy/tagger.pyx | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 39ba7ed41..f91bbeb0a 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -9,7 +9,7 @@ from .tokens cimport Tokens cdef class Tagger: - cdef class_t predict(self, atom_t* context, object golds=*) except * + cdef class_t predict(self, const atom_t* context, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index e0cd0bf3b..22ec3896a 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -26,8 
+26,8 @@ def setup_model_dir(tag_names, tag_counts, templates, model_dir): cdef class Tagger: - """Assign part-of-speech, named entity or supersense tags, using greedy - decoding. The tagger reads its model and configuration from disk. + """Predict some type of tag, using greedy decoding. The tagger reads its + model and configuration from disk. """ def __init__(self, model_dir): self.mem = Pool() @@ -40,7 +40,7 @@ cdef class Tagger: if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - cdef class_t predict(self, atom_t* context, object golds=None) except *: + cdef class_t predict(self, const atom_t* context, object golds=None) except *: """Predict the tag of tokens[i]. The tagger remembers the features and prediction, in case you later call tell_answer. From 7b68f911cf882d5f2694eb7ea26eddf37b9c9070 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 01:39:13 +1100 Subject: [PATCH 40/56] * Add WordNet lemmatizer --- spacy/lemmatizer.py | 87 +++++++++++++++++++++++++ tests/{test_ner.py => depr_test_ner.py} | 0 tests/test_lemmatizer.py | 34 ++++++++++ 3 files changed, 121 insertions(+) create mode 100644 spacy/lemmatizer.py rename tests/{test_ner.py => depr_test_ner.py} (100%) create mode 100644 tests/test_lemmatizer.py diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..a42a5daee --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,87 @@ +from os import path + + +NOUN_RULES = ( + ('s', ''), + ('ses', 's'), + ('ves', 'f'), + ('xes', 'x'), + ('zes', 'z'), + ('ches', 'ch'), + ('shes', 'sh'), + ('men', 'man'), + ('ies', 'y') +) + + +VERB_RULES = ( + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", "") +) + + +ADJ_RULES = ( + ("er", ""), + ("est", ""), + ("er", "e"), + ("est", "e") +) + + +class Lemmatizer(object): + def __init__(self, wn_dict_dir): + self.index = {} + self.exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) + self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + + def noun(self, string): + return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + + def verb(self, string): + return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + + def adj(self, string): + return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + + +def lemmatize(string, index, exceptions, rules): + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + return set(forms) + + +def read_index(loc): + index = set() + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + word = pieces[0] + if word.count('_') == 0: + index.add(word) + return index + + +def read_exc(loc): + exceptions = {} + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + exceptions[pieces[0]] = tuple(pieces[1:]) + return exceptions diff --git a/tests/test_ner.py b/tests/depr_test_ner.py similarity index 100% rename from tests/test_ner.py rename to tests/depr_test_ner.py diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py new file mode 100644 index 000000000..2047e4d2c --- /dev/null +++ b/tests/test_lemmatizer.py @@ -0,0 +1,34 @@ +from spacy.lemmatizer import Lemmatizer, read_index, 
read_exc +from spacy.util import DATA_DIR +from os import path + +import pytest + + +def test_read_index(): + wn = path.join(DATA_DIR, 'wordnet') + index = read_index(path.join(wn, 'index.noun')) + assert 'man' in index + assert 'plantes' not in index + assert 'plant' in index + + +def test_read_exc(): + wn = path.join(DATA_DIR, 'wordnet') + exc = read_exc(path.join(wn, 'verb.exc')) + assert exc['was'] == ('be',) + + +@pytest.fixture +def lemmatizer(): + return Lemmatizer(path.join(DATA_DIR, 'wordnet')) + + +def test_noun_lemmas(lemmatizer): + do = lemmatizer.noun + + assert do('aardwolves') == set(['aardwolf']) + assert do('aardwolf') == set(['aardwolf']) + assert do('planets') == set(['planet']) + assert do('ring') == set(['ring']) + assert do('axes') == set(['axis', 'axe', 'ax']) From 99bbbb6febf689250df0143394a82eb6177a5be2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 21:12:15 +1100 Subject: [PATCH 41/56] * Work on morphological processing --- spacy/en.pxd | 51 +++++++++++++++++++++++++++++++ spacy/en.pyx | 73 +++++++++++++++++++++++++++++++++++++++++---- spacy/lang.pxd | 8 +++-- spacy/lang.pyx | 39 +++++++++++++++++++++--- spacy/lemmatizer.py | 3 ++ spacy/pos_util.py | 3 +- spacy/tagger.pxd | 32 +++++++++++++++++++- spacy/tagger.pyx | 42 +++++++++++++++++++++++--- spacy/tokens.pxd | 17 ++++++++++- spacy/tokens.pyx | 14 +++++++-- 10 files changed, 261 insertions(+), 21 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index 8ce023106..6887dbc08 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -5,6 +5,57 @@ from .tokens cimport Tokens from .tokens cimport TokenC +cpdef enum en_person_t: + NO_PERSON + FIRST + SECOND + THIRD + + +cpdef enum en_number_t: + NO_NUMBER + SINGULAR + PLURAL + MASS + CARDINAL + ORDINAL + + +cpdef enum en_gender_t: + NO_GENDER + MASCULINE + FEMININE + + +cpdef enum en_tenspect_t: + NO_TENSE + BASE_VERB + PRESENT + PAST + PASSIVE + ING + MODAL + + +cpdef enum en_case_t: + NO_CASE + NOMINATIVE + ACCUSATIVE + GENITIVE + DEMONYM + + +cpdef enum misc_t: + NO_MISC + COMPARATIVE + SUPERLATIVE + RELATIVE + NAME + URL + EMAIL + EMOTICON + + # Flags cpdef enum FlagID: IS_ALPHA diff --git a/spacy/en.pyx b/spacy/en.pyx index c0eb0368b..fa59ef933 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -35,6 +35,63 @@ from __future__ import unicode_literals cimport lang from .typedefs cimport flags_t import orth +from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from .tagger cimport X, PUNCT, EOL + + +POS_TAGS = { + 'NULL': (NO_TAG, {}), + 'EOL': (EOL, {}), + 'CC': (CONJ, {}), + 'CD': (NUM, {}), + 'DT': (DET, {}), + 'EX': (DET, {}), + 'FW': (X, {}), + 'IN': (ADP, {}), + 'JJ': (ADJ, {}), + 'JJR': (ADJ, {'misc': COMPARATIVE}), + 'JJS': (ADJ, {'misc': SUPERLATIVE}), + 'LS': (X, {}), + 'MD': (VERB, {'tenspect': MODAL}), + 'NN': (NOUN, {}), + 'NNS': (NOUN, {'number': PLURAL}), + 'NNP': (NOUN, {'misc': NAME}), + 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), + 'PDT': (DET, {}), + 'POS': (PRT, {'case': GENITIVE}), + 'PRP': (NOUN, {}), + 'PRP$': (NOUN, {'case': GENITIVE}), + 'RB': (ADV, {}), + 'RBR': (ADV, {'misc': COMPARATIVE}), + 'RBS': (ADV, {'misc': SUPERLATIVE}), + 'RP': (PRT, {}), + 'SYM': (X, {}), + 'TO': (PRT, {}), + 'UH': (X, {}), + 'VB': (VERB, {}), + 'VBD': (VERB, {'tenspect': PAST}), + 'VBG': (VERB, {'tenspect': ING}), + 'VBN': (VERB, {'tenspect': PASSIVE}), + 'VBP': (VERB, {'tenspect': PRESENT}), + 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), + 'WDT': (DET, {'misc': RELATIVE}), + 'WP': (PRON, {'misc': 
RELATIVE}), + 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), + 'WRB': (ADV, {'misc': RELATIVE}), + '!': (PUNCT, {}), + '#': (PUNCT, {}), + '$': (PUNCT, {}), + "''": (PUNCT, {}), + "(": (PUNCT, {}), + ")": (PUNCT, {}), + "-LRB-": (PUNCT, {}), + "-RRB-": (PUNCT, {}), + ".": (PUNCT, {}), + ",": (PUNCT, {}), + "``": (PUNCT, {}), + ":": (PUNCT, {}), + "?": (PUNCT, {}), +} POS_TEMPLATES = ( @@ -91,19 +148,25 @@ cdef class English(Language): def set_pos(self, Tokens tokens): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context + cdef TokenC* t = tokens.data for i in range(tokens.length): - fill_pos_context(context, i, tokens.data) - tokens.data[i].pos = self.pos_tagger.predict(context) + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) + #self.morphalyser.set_token(&t[i]) def train_pos(self, Tokens tokens, golds): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context c = 0 + cdef TokenC* t = tokens.data for i in range(tokens.length): - fill_pos_context(context, i, tokens.data) - tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]]) - c += tokens.data[i].pos == golds[i] + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context, [golds[i]]) + t[i].morph = self.pos_tagger.tags[t[i].pos].morph + #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex) + c += t[i].pos == golds[i] return c + EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 20374f40d..124281a6b 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -2,20 +2,20 @@ from libcpp.vector cimport vector from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER -from preshed.maps cimport PreshMap +from preshed.maps cimport PreshMap, PreshMapArray from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger +from .tagger cimport PosTag from .utf8string cimport StringStore, UniStr cdef class Lexicon: cpdef public get_lex_props cdef Pool mem - cpdef readonly size_t size cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes @@ -29,13 +29,17 @@ cdef class Language: cdef readonly unicode name cdef PreshMap _cache cdef PreshMap _specials + cdef PreshMapArray _lemmas cpdef readonly Lexicon lexicon cpdef readonly Tagger pos_tagger + cpdef readonly object lemmatizer cdef object _prefix_re cdef object _suffix_re cdef object _infix_re + cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1 + cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 496c6742c..fdeb7df66 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 from preshed.maps cimport PreshMap +from .lemmatizer import Lemmatizer from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME @@ -26,6 +27,8 @@ from . 
import util from .util import read_lang_data from .tokens import Tokens +from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS + cdef class Language: def __init__(self, name): @@ -39,14 +42,40 @@ cdef class Language: self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) + self._lemmas = PreshMapArray(N_UNIV_TAGS) self.pos_tagger = None + self.lemmatizer = None def load(self): + self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet')) self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) + cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1: + if self.lemmatizer is None: + return lex.sic + if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ: + return lex.sic + cdef int lemma = self._lemmas.get(pos.pos, lex.sic) + if lemma != 0: + return lemma + cdef bytes py_string = self.lexicon.strings[lex.sic] + cdef set lemma_strings + cdef bytes lemma_string + if pos.pos == NOUN: + lemma_strings = self.lemmatizer.noun(py_string) + elif pos.pos == VERB: + lemma_strings = self.lemmatizer.verb(py_string) + else: + assert pos.pos == ADJ + lemma_strings = self.lemmatizer.adj(py_string) + lemma_string = sorted(lemma_strings)[0] + lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i + self._lemmas.set(pos.pos, lex.sic, lemma) + return lemma + cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) cdef Tokens tokens = Tokens(self.lexicon.strings, length) @@ -254,9 +283,11 @@ cdef class Lexicon: self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) - self.size = 2 self.get_lex_props = get_props + def __len__(self): + return self.lexemes.size() + cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool @@ -269,14 +300,13 @@ cdef class Lexicon: mem = self.mem cdef unicode py_string = string.chars[:string.n] lex = mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, py_string, string.key, self.strings, + lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings, self.get_lex_props(py_string)) if mem is self.mem: self._map.set(string.key, lex) while self.lexemes.size() < (lex.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lex.id] = lex - self.size += 1 else: lex[0].id = 1 return lex @@ -302,6 +332,8 @@ cdef class Lexicon: a dict if the operator is called from Python. 
''' if type(id_or_string) == int: + if id_or_string >= self.lexemes.size(): + raise IndexError return self.lexemes.at(id_or_string)[0] cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string)) @@ -359,5 +391,4 @@ cdef class Lexicon: self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lexeme.id] = lexeme i += 1 - self.size += 1 fclose(fp) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a42a5daee..ce9bbefdc 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -53,6 +53,7 @@ class Lemmatizer(object): def lemmatize(string, index, exceptions, rules): + string = string.lower() forms = [] if string in index: forms.append(string) @@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules): form = string[:len(string) - len(old)] + new if form in index: forms.append(form) + if not forms: + forms.append(string) return set(forms) diff --git a/spacy/pos_util.py b/spacy/pos_util.py index e5716665e..489f03dde 100644 --- a/spacy/pos_util.py +++ b/spacy/pos_util.py @@ -147,6 +147,7 @@ Y PRT Z NOUN ^ NOUN ~ X -`` .""".strip().split('\n')) +`` . +EOL EOL""".strip().split('\n')) return mapping[tag] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index f91bbeb0a..11880bf13 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -1,11 +1,40 @@ +from libc.stdint cimport uint8_t + from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t +from preshed.maps cimport PreshMapArray + from .typedefs cimport hash_t -from .tokens cimport Tokens +from .tokens cimport Tokens, Morphology + + +# Google universal tag set +cdef enum univ_tag_t: + NO_TAG + ADJ + ADV + ADP + CONJ + DET + NOUN + NUM + PRON + PRT + VERB + X + PUNCT + EOL + N_UNIV_TAGS + + +cdef struct PosTag: + Morphology morph + int id + univ_tag_t pos cdef class Tagger: @@ -16,4 +45,5 @@ cdef class Tagger: cpdef readonly LinearModel model cpdef readonly list tag_names + cdef PosTag* tags cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 22ec3896a..db7974d91 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -12,13 +12,14 @@ import cython from thinc.features cimport Feature, count_feats -def setup_model_dir(tag_names, tag_counts, templates, model_dir): +def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { 'templates': templates, 'tag_names': tag_names, + 'tag_map': tag_map, 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: @@ -33,16 +34,31 @@ cdef class Tagger: self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] + tag_map = cfg['tag_map'] + univ_counts = {} + cdef unicode tag + cdef unicode univ_tag self.tag_names = cfg['tag_names'] + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): + pos, props = tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + self.tags[i].morph.number = props.get('number', 0) + self.tags[i].morph.tenspect = props.get('tenspect', 0) + self.tags[i].morph.mood = props.get('mood', 0) + self.tags[i].morph.gender = props.get('gender', 0) + self.tags[i].morph.person = props.get('person', 0) + self.tags[i].morph.case = props.get('case', 0) + self.tags[i].morph.misc = props.get('misc', 0) self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) self.model = 
LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - cdef class_t predict(self, const atom_t* context, object golds=None) except *: - """Predict the tag of tokens[i]. The tagger remembers the features and - prediction, in case you later call tell_answer. + cdef class_t predict(self, atom_t* context, object golds=None) except *: + """Predict the tag of tokens[i]. >>> tokens = EN.tokenize(u'An example sentence.') >>> tag = EN.pos_tagger.predict(0, tokens) @@ -69,6 +85,24 @@ cdef class Tagger: return tag_id +UNIV_TAGS = { + 'NULL': NO_TAG, + 'ADJ': ADJ, + 'ADV': ADV, + 'ADP': ADP, + 'CONJ': CONJ, + 'DET': DET, + 'NOUN': NOUN, + 'NUM': NUM, + 'PRON': PRON, + 'PRT': PRT, + 'VERB': VERB, + 'X': X, + '.': PUNCT, + 'EOL': EOL +} + + def _make_tag_dict(counts): freq_thresh = 50 ambiguity_thresh = 0.98 diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index e6bc0a46a..6f4691716 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -5,14 +5,29 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport atom_t from .lexeme cimport Lexeme + from .typedefs cimport flags_t from .utf8string cimport StringStore +from libc.stdint cimport uint8_t, uint16_t + + +cdef struct Morphology: + uint8_t number + uint8_t tenspect # Tense/aspect/voice + uint8_t mood + uint8_t gender + uint8_t person + uint8_t case + uint8_t misc + cdef struct TokenC: const Lexeme* lex + Morphology morph int idx int pos + int lemma int sense @@ -37,7 +52,7 @@ cdef class Token: cdef public int i cdef public int idx cdef public int pos - cdef public int ner + cdef int lemma cdef public atom_t id cdef public atom_t cluster diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 33f265eef..004d0578c 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -51,7 +51,7 @@ cdef class Tokens: def __getitem__(self, i): bounds_check(i, self.length, PADDING) return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, - self.data[i].sense, self.data[i].lex[0]) + self.data[i].lemma, self.data[i].lex[0]) def __iter__(self): for i in range(self.length): @@ -128,14 +128,15 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, int ner, + def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma, dict lex): self._string_store = string_store self.idx = idx self.pos = pos - self.ner = ner self.i = i self.id = lex['id'] + + self.lemma = lemma self.cluster = lex['cluster'] self.length = lex['length'] @@ -156,3 +157,10 @@ cdef class Token: return '' cdef bytes utf8string = self._string_store[self.sic] return utf8string.decode('utf8') + + property lemma: + def __get__(self): + if self.lemma == 0: + return self.string + cdef bytes utf8string = self._string_store[self.lemma] + return utf8string.decode('utf8') From cda9ea9a4af9d85fa590a7272649f6524a62df77 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 21:12:51 +1100 Subject: [PATCH 42/56] * Add test to make sure iterating over the lexicon isnt broken --- tests/test_iter_lexicon.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/test_iter_lexicon.py diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py new file mode 100644 index 000000000..379ebd3bb --- /dev/null +++ b/tests/test_iter_lexicon.py @@ -0,0 +1,15 @@ +import pytest + +from spacy.en import EN + +def test_range_iter(): + EN.load() + for i in range(len(EN.lexicon)): + lex = 
EN.lexicon[i] + + +def test_iter(): + EN.load() + i = 0 + for lex in EN.lexicon: + i += 1 From 302e09018bc090a32f45170d208bb2b6898ac185 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 14:48:01 +1100 Subject: [PATCH 43/56] * Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas --- spacy/en.pxd | 24 ++++---- spacy/en.pyx | 25 ++++++++- spacy/lang.pxd | 5 +- spacy/lang.pyx | 112 +++++++++++++++++++++++-------------- spacy/tokens.pxd | 1 - spacy/util.py | 10 +++- tests/test_contractions.py | 12 ++-- tests/test_tokenizer.py | 17 ++++-- 8 files changed, 136 insertions(+), 70 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index 6887dbc08..cee754d9c 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -10,6 +10,7 @@ cpdef enum en_person_t: FIRST SECOND THIRD + NON_THIRD cpdef enum en_number_t: @@ -17,14 +18,22 @@ cpdef enum en_number_t: SINGULAR PLURAL MASS - CARDINAL - ORDINAL cpdef enum en_gender_t: NO_GENDER MASCULINE FEMININE + NEUTER + + +cpdef enum en_case_t: + NO_CASE + NOMINATIVE + GENITIVE + ACCUSATIVE + REFLEXIVE + DEMONYM cpdef enum en_tenspect_t: @@ -37,23 +46,12 @@ cpdef enum en_tenspect_t: MODAL -cpdef enum en_case_t: - NO_CASE - NOMINATIVE - ACCUSATIVE - GENITIVE - DEMONYM - - cpdef enum misc_t: NO_MISC COMPARATIVE SUPERLATIVE RELATIVE NAME - URL - EMAIL - EMOTICON # Flags diff --git a/spacy/en.pyx b/spacy/en.pyx index fa59ef933..0136818f2 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -38,6 +38,8 @@ import orth from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB from .tagger cimport X, PUNCT, EOL +from .tokens cimport Morphology + POS_TAGS = { 'NULL': (NO_TAG, {}), @@ -152,7 +154,8 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context) - #self.morphalyser.set_token(&t[i]) + _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) + t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) def train_pos(self, Tokens tokens, golds): cdef int i @@ -162,11 +165,27 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - t[i].morph = self.pos_tagger.tags[t[i].pos].morph - #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex) + _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) + t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) c += t[i].pos == golds[i] return c +cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1: + if tok_morph.number == 0: + tok_morph.number = pos_morph.number + if tok_morph.tenspect == 0: + tok_morph.tenspect = pos_morph.tenspect + if tok_morph.mood == 0: + tok_morph.mood = pos_morph.mood + if tok_morph.gender == 0: + tok_morph.gender = pos_morph.gender + if tok_morph.person == 0: + tok_morph.person = pos_morph.person + if tok_morph.case == 0: + tok_morph.case = pos_morph.case + if tok_morph.misc == 0: + tok_morph.misc = pos_morph.misc + EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 124281a6b..0307e12fe 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -9,7 +9,7 @@ from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger -from .tagger cimport PosTag +from .tagger cimport univ_tag_t from .utf8string cimport StringStore, UniStr @@ -38,11 +38,12 @@ cdef class Language: cdef object _suffix_re cdef object 
_infix_re - cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1 + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1 cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL diff --git a/spacy/lang.pyx b/spacy/lang.pyx index fdeb7df66..cdae8644a 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -28,6 +28,7 @@ from .util import read_lang_data from .tokens import Tokens from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS +from .tokens cimport Morphology cdef class Language: @@ -53,27 +54,27 @@ cdef class Language: if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) - cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1: + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: return lex.sic - if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ: + if pos != NOUN and pos != VERB and pos != ADJ: return lex.sic - cdef int lemma = self._lemmas.get(pos.pos, lex.sic) + cdef int lemma = self._lemmas.get(pos, lex.sic) if lemma != 0: return lemma cdef bytes py_string = self.lexicon.strings[lex.sic] cdef set lemma_strings cdef bytes lemma_string - if pos.pos == NOUN: + if pos == NOUN: lemma_strings = self.lemmatizer.noun(py_string) - elif pos.pos == VERB: + elif pos == VERB: lemma_strings = self.lemmatizer.verb(py_string) else: - assert pos.pos == ADJ + assert pos == ADJ lemma_strings = self.lemmatizer.adj(py_string) lemma_string = sorted(lemma_strings)[0] lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i - self._lemmas.set(pos.pos, lex.sic, lemma) + self._lemmas.set(pos, lex.sic, lemma) return lemma cpdef Tokens tokens_from_list(self, list strings): @@ -111,6 +112,7 @@ cdef class Language: return tokens cdef int i = 0 cdef int start = 0 + cdef bint cache_hit cdef Py_UNICODE* chars = string cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) cdef UniStr span @@ -118,10 +120,8 @@ cdef class Language: if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) in_ws = not in_ws start = i @@ -130,13 +130,32 @@ cdef class Language: i += 1 if start < i: slice_unicode(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) return tokens + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: + cdef int i + specials = self._specials.get(key) + if specials != NULL: + i = 0 + while specials[i].lex != NULL: + tokens.push_back(idx, specials[i].lex) + tokens.data[tokens.length - 1].pos = specials[i].pos + tokens.data[tokens.length - 1].morph = specials[i].morph + tokens.data[tokens.length - 1].lemma = specials[i].lemma + tokens.data[tokens.length - 1].sense = specials[i].sense + i += 1 + return True + else: + cached = self._cache.get(key) + if cached 
!= NULL: + tokens.extend(i, cached, 0) + return True + else: + return False + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] suffixes @@ -190,10 +209,10 @@ cdef class Language: break return string - cdef int _attach_tokens(self, Tokens tokens, - int idx, UniStr* string, + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, vector[const Lexeme*] *prefixes, vector[const Lexeme*] *suffixes) except -1: + cdef bint cache_hit cdef int split cdef const Lexeme* const* lexemes cdef Lexeme* lexeme @@ -201,10 +220,9 @@ cdef class Language: if prefixes.size(): idx = tokens.extend(idx, prefixes.data(), prefixes.size()) if string.n != 0: - - lexemes = self._cache.get(string.key) - if lexemes != NULL: - idx = tokens.extend(idx, lexemes, 0) + cache_hit = self._try_cache(idx, string.key, tokens) + if cache_hit: + idx = tokens.data[tokens.length - 1].idx + 1 else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: @@ -247,30 +265,42 @@ cdef class Language: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, token_rules): - '''Load special-case tokenization rules. - - Loads special-case tokenization rules into the Language._cache cache, - read from data//tokenization . The special cases are loaded before - any language data is tokenized, giving these priority. For instance, - the English tokenization rules map "ain't" to ["are", "not"]. - - Args: - token_rules (list): A list of (chunk, tokens) pairs, where chunk is - a string and tokens is a list of strings. + def _load_special_tokenization(self, object rules): + '''Add a special-case tokenization rule. 
''' + cdef int i + cdef unicode chunk + cdef list substrings + cdef unicode form + cdef unicode lemma + cdef dict props cdef Lexeme** lexemes cdef hash_t hashed cdef UniStr string - for uni_string, substrings in token_rules: - lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) - for i, substring in enumerate(substrings): - slice_unicode(&string, substring, 0, len(substring)) - lexemes[i] = self.lexicon.get(self.lexicon.mem, &string) - lexemes[i + 1] = NULL - slice_unicode(&string, uni_string, 0, len(uni_string)) - self._specials.set(string.key, lexemes) - self._cache.set(string.key, lexemes) + for chunk, substrings in sorted(rules.items()): + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + form = props['F'] + lemma = props.get("L", None) + slice_unicode(&string, form, 0, len(form)) + tokens[i].lex = self.lexicon.get(self.lexicon.mem, &string) + if lemma: + tokens[i].lemma = self.lexicon.strings[lemma] + set_morph_from_dict(&tokens[i].morph, props) + # Null-terminated array + tokens[i+1].lex = NULL + slice_unicode(&string, chunk, 0, len(chunk)) + self._specials.set(string.key, tokens) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) cdef class Lexicon: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 6f4691716..f3d6011ec 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -21,7 +21,6 @@ cdef struct Morphology: uint8_t misc - cdef struct TokenC: const Lexeme* lex Morphology morph diff --git a/spacy/util.py b/spacy/util.py index 5062ca6db..ff03760a5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,7 +13,8 @@ def utf8open(loc, mode='r'): def read_lang_data(name): data_dir = path.join(DATA_DIR, name) - tokenization = read_tokenization(name) + with open(path.join(data_dir, 'specials.json')) as file_: + tokenization = ujson.load(file_) prefix = read_prefix(data_dir) suffix = read_suffix(data_dir) infix = read_infix(data_dir) @@ -26,12 +27,17 @@ def read_prefix(data_dir): expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) return expression + def read_suffix(data_dir): - with utf8open(path.join(data_dir, 'suffix')) as file_: + with utf8open(path.join(data_dir, 'suffix')) as file_: entries = file_.read().split('\n') expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()]) + # TODO: Fix this hack! 
+ expression += r'|(?<=[a-z0-9])\.$' + expression += r'|(?<=[0-9])km$' return expression + def read_infix(data_dir): with utf8open(path.join(data_dir, 'infix')) as file_: entries = file_.read().split('\n') diff --git a/tests/test_contractions.py b/tests/test_contractions.py index 8334a74a9..1e697afd2 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -20,15 +20,18 @@ def test_apostrophe(): def test_LL(): tokens = EN.tokenize("we'll") assert len(tokens) == 2 - assert tokens[1].string == "will" + assert tokens[1].string == "'ll" + assert tokens[1].lemma == "will" assert tokens[0].string == "we" def test_aint(): tokens = EN.tokenize("ain't") assert len(tokens) == 2 - assert tokens[0].string == "are" - assert tokens[1].string == "not" + assert tokens[0].string == "ai" + assert tokens[0].lemma == "be" + assert tokens[1].string == "n't" + assert tokens[1].lemma == "not" def test_capitalized(): @@ -38,7 +41,8 @@ def test_capitalized(): assert len(tokens) == 2 tokens = EN.tokenize("Ain't") assert len(tokens) == 2 - assert tokens[0].string == "Are" + assert tokens[0].string == "Ai" + assert tokens[0].lemma == "be" def test_punct(): diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index fb5f78ed7..21d115b9b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -34,7 +34,7 @@ def test_digits(): def test_contraction(): tokens = EN.tokenize("don't giggle") assert len(tokens) == 3 - assert tokens[1].sic == EN.lexicon["not"]['sic'] + assert tokens[1].sic == EN.lexicon["n't"]['sic'] tokens = EN.tokenize("i said don't!") assert len(tokens) == 5 assert tokens[4].sic == EN.lexicon['!']['sic'] @@ -71,30 +71,39 @@ def test_cnts1(): tokens = EN.tokenize(text) assert len(tokens) == 8 + def test_cnts2(): text = u"""U.N. regulations are not a part of their concern.""" tokens = EN.tokenize(text) assert len(tokens) == 10 + def test_cnts3(): text = u"“Isn't it?”" tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + def test_cnts4(): text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ tokens = EN.tokenize(text) - assert len(tokens) == 15 + words = [t.string for t in tokens] + assert len(words) == 15 + def test_cnts5(): text = """'Me too!', Mr. P. Delaware cried. """ tokens = EN.tokenize(text) assert len(tokens) == 11 + def test_cnts6(): text = u'They ran about 10km.' tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + #def test_cnts7(): # text = 'But then the 6,000-year ice age came...' 
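The special cases are now read from a specials.json file (see the read_lang_data change above) as a mapping from each chunk to a list of per-token property dicts. A minimal sketch of the structure _load_special_tokenization consumes after ujson.load; the 'F' (form) and 'L' (lemma) keys come from the diff, while the concrete entry and the variable name are illustrative rather than a copy of the shipped data file:

    # Illustrative sketch only, not the shipped specials.json contents.
    # 'F' is the surface form of the token; 'L' optionally overrides its lemma.
    # Any further keys ('number', 'tenspect', 'mood', 'gender', 'person',
    # 'case', 'misc') are read by set_morph_from_dict.
    SPECIAL_CASES = {
        "ain't": [
            {"F": "ai", "L": "be"},
            {"F": "n't", "L": "not"},
        ],
    }

The entry mirrors the updated contraction tests, which expect "ain't" to split into "ai" / "n't" with lemmas "be" and "not".
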
From 2a6bd2818f581b385da3f9871d13663454507781 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 15:18:43 +1100 Subject: [PATCH 44/56] * Load the lexicon before we check flag values --- tests/test_lexeme_flags.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py index 10276d8ea..c1fe2d847 100644 --- a/tests/test_lexeme_flags.py +++ b/tests/test_lexeme_flags.py @@ -7,6 +7,7 @@ from spacy.lexeme import * def test_is_alpha(): + EN.load() the = EN.lexicon['the'] assert the['flags'] & (1 << IS_ALPHA) year = EN.lexicon['1999'] @@ -16,6 +17,7 @@ def test_is_alpha(): def test_is_digit(): + EN.load() the = EN.lexicon['the'] assert not the['flags'] & (1 << IS_DIGIT) year = EN.lexicon['1999'] From 1ccabc806ef8c4575e12439b05f2d7cdb6d4b7e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:06:18 +1100 Subject: [PATCH 45/56] * Work on lemmatization --- data/en/suffix | 1 - data/en/tokenization | 93 -------------------------------------------- 2 files changed, 94 deletions(-) diff --git a/data/en/suffix b/data/en/suffix index 77400d0fd..a6ff32849 100644 --- a/data/en/suffix +++ b/data/en/suffix @@ -16,7 +16,6 @@ $ '' 's 'S -. .. ... .... diff --git a/data/en/tokenization b/data/en/tokenization index e2b78dd28..382b7e383 100644 --- a/data/en/tokenization +++ b/data/en/tokenization @@ -4,99 +4,6 @@ #*---* --- #*'s 's -'s 's -'S 'S -ain't ai n't -aren't are n't -can't ca n't -cannot can not -could've could 've -couldn't could n't -couldn't've could n't 've -didn't did n't -doesn't does n't -don't do n't -hadn't had n't -hadn't've had n't 've -hasn't has n't -haven't have n't -he'd he 'd -he'd've he 'd 've -he'll he 'll -he's he 's -how'd he 'd -how'll he 'll -how's how 's -I'd I 'd -I'd've I 'd 've -I'll I 'll -I'm I 'm -I'ma I 'ma -I've I 've -isn't is n't -it'd it 'd -it'd've it 'd 've -it'll it 'll -it's it 's -let's let 's -mightn't might n't -mightn't've might n't 've -might've might 've -mustn't must n't -must've must 've -needn't need n't -not've not 've -shan't sha n't -she'd she 'd -she'd've she 'd 've -she'll she will -she's she 's -should've should 've -shouldn't should n't -shouldn't've should n't 've -that's that 's -there'd there 'd -there'd've there 'd 've -there's there 's -they'd there 'd -they'd've they 'd 've -they'll they 'll -they're they 're -they've they 've -wasn't was n't -we'd we 'd -we'd've we 'd 've -we'll we 'll -we're we 're -we've we 've -weren't were n't -what'll what 'll -what're what 're -what's what 's -what've what 've -when's when 's -where'd where 'd -where's where 's -where've where 've -who'd who 'd -who'll who 'll -who're who 're -who's who 's -who've who 've -why'll why 'll -why're why 're -why's why 's -won't wo n't -would've would 've -wouldn't would n't -wouldn't've would n't 've -you'd you 'd -you'd've you 'd 've -you'll you 'll -you're you 're -you've you 've -'em 'em -'ol 'ol 10km 10 km U.S. U.S. U.K. U.K. From f15deaad5b5c506797772ded36a3102babfd4435 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:08:01 +1100 Subject: [PATCH 46/56] * Upd docs --- docs/source/index.rst | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 808455fd0..fb738aa32 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,14 +10,27 @@ spaCy NLP Tokenizer and Lexicon spaCy is a library for industrial-strength NLP in Python and Cython. 
spaCy's take on NLP is that it's mostly about feature extraction --- that's the part that's specific to NLP, so that's what an NLP library should focus on. -It should tell you what the current best-practice is, and help you do exactly -that, quickly and efficiently. -Best-practice is to **use lots of large lexicons**. Let's say you hit the word -*belieber* in production. What will your system know about this word? A bad -system will only know things about the words in its training corpus, which -probably consists of texts written before Justin Bieber was even born. -It doesn't have to be like that. +spaCy also believes that for NLP, **efficiency is critical**. If you're +running batch jobs, you probably have an enormous amount of data; if you're +serving requests one-by-one, you want lower latency and fewer servers. Even if +you're doing exploratory research on relatively small samples, you should still +value efficiency, because it means you can run more experiments. + +Depending on the task, spaCy is between 10 and 200 times faster than NLTK, +often with much better accuracy. See Benchmarks for details, and +Why is spaCy so fast? for a discussion of the algorithms and implementation +that makes this possible. + ++---------+----------+-------------+----------+ +| System | Tokenize | --> Counts | --> Stem | ++---------+----------+-------------+----------+ +| spaCy | 1m42s | 1m59s | 1m59s | ++---------+----------+-------------+----------+ +| NLTK | 20m2s | 28m24s | 52m28 | ++---------+----------+-------------+----------+ + +Times for 100m words of text. Unique Lexicon-centric design @@ -25,15 +38,14 @@ Unique Lexicon-centric design spaCy helps you build models that generalise better, by making it easy to use more robust features. Instead of a list of strings, the tokenizer returns -references to rich lexical types. Its tokenizer returns sequence of references -to rich lexical types. Features which ask about the word's Brown cluster, its -typical part-of-speech tag, how it's usually cased etc require no extra effort: +references to rich lexical types. Features which ask about the word's Brown cluster, +its typical part-of-speech tag, how it's usually cased etc require no extra effort: >>> from spacy.en import EN >>> from spacy.feature_names import * >>> feats = ( SIC, # ID of the original word form - NORM, # ID of the normalized word form + STEM, # ID of the stemmed word form CLUSTER, # ID of the word's Brown cluster IS_TITLE, # Was the word title-cased? POS_TYPE # A cluster ID describing what POS tags the word is usually assigned @@ -113,14 +125,6 @@ all to the special tokenization rules. spaCy's tokenizer is also incredibly efficient: -+--------+---------------+--------------+ -| System | Tokens/second | Speed Factor | -+--------+---------------+--------------+ -| NLTK | 89 000 | 1.00 | -+--------+---------------+--------------+ -| spaCy | 3 093 000 | 38.30 | -+--------+---------------+--------------+ - spaCy can create an inverted index of the 1.8 billion word Gigaword corpus, in under half an hour --- on a Macbook Air. See the `inverted index tutorial`_. 
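
As a rough usage sketch of the feature-ID design described in the docs updated by PATCH 46 -- not part of the patch series itself, and assuming the 2014-era `EN.load()` / `EN.tokenize()` / `Tokens.get_array()` calls and `spacy.feature_names` constants that appear elsewhere in these diffs -- counting word forms by integer ID rather than by string looks roughly like this:

    # Hedged sketch: bag-of-IDs counting over the integer attribute matrix.
    # Assumes the API shown in these patches; module and attribute names may differ.
    from collections import Counter

    from spacy.en import EN
    from spacy.feature_names import SIC, CLUSTER

    EN.load()                               # load lexemes, strings and models
    tokens = EN.tokenize(u"They ran about 10km. They ran back.")
    arr = tokens.get_array([SIC, CLUSTER])  # one row per token, one column per attribute ID

    form_counts = Counter(int(row[0]) for row in arr)   # keyed by word-form ID, not by string
    print(form_counts.most_common(3))       # the IDs for u'They', u'ran' and u'.' each appear twice
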
From 6369835306a31b896a28b19f7620506b6d33dd34 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:08:17 +1100 Subject: [PATCH 47/56] * Add false positive test for emoticons --- tests/test_emoticons.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py index 6bb58e661..143be607d 100644 --- a/tests/test_emoticons.py +++ b/tests/test_emoticons.py @@ -27,3 +27,9 @@ def test_tweebo_challenge(): assert tokens[19].string == '")' assert tokens[20].string == ':>' assert tokens[21].string == '....' + + +def test_false_positive(): + text = "example:)" + tokens = EN.tokenize(text) + assert len(tokens) == 3 From 516f0f1e144af000bb2afb19b4745a46c6c3b4cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:08:45 +1100 Subject: [PATCH 48/56] * Remove test for loading ad hoc rules format --- tests/test_rules.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 tests/test_rules.py diff --git a/tests/test_rules.py b/tests/test_rules.py deleted file mode 100644 index b19a1c3f1..000000000 --- a/tests/test_rules.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy import util - - -def test_load_en(): - rules = util.read_tokenization('en') - assert len(rules) != 0 - aint = [rule for rule in rules if rule[0] == "ain't"][0] - chunk, pieces = aint - assert chunk == "ain't" - assert pieces[0] == "are" - assert pieces[1] == "not" From 495e1c7366d30f79ef3332d05c94dbf14abd909d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 16:50:01 +1100 Subject: [PATCH 49/56] * Use fused type in Tokens.push_back, simplifying the use of the cache --- spacy/lang.pxd | 11 +++++++++++ spacy/lang.pyx | 39 +++++++++++++++++++++------------------ spacy/tokens.pxd | 10 +++++++++- spacy/tokens.pyx | 12 ++++++------ 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 0307e12fe..8a6aa5f97 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -13,6 +13,17 @@ from .tagger cimport univ_tag_t from .utf8string cimport StringStore, UniStr +cdef union LexemesOrTokens: + const Lexeme* const* lexemes + TokenC* tokens + + +cdef struct Cached: + LexemesOrTokens data + bint is_lex + int length + + cdef class Lexicon: cpdef public get_lex_props cdef Pool mem diff --git a/spacy/lang.pyx b/spacy/lang.pyx index cdae8644a..044bfb7bc 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -137,21 +137,19 @@ cdef class Language: cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: cdef int i - specials = self._specials.get(key) - if specials != NULL: - i = 0 - while specials[i].lex != NULL: - tokens.push_back(idx, specials[i].lex) - tokens.data[tokens.length - 1].pos = specials[i].pos - tokens.data[tokens.length - 1].morph = specials[i].morph - tokens.data[tokens.length - 1].lemma = specials[i].lemma - tokens.data[tokens.length - 1].sense = specials[i].sense - i += 1 + cdef TokenC* token + cached = self._specials.get(key) + if cached != NULL: + assert not cached.is_lex + for i in range(cached.length): + token = &cached.data.tokens[i] + idx = tokens.push_back(idx, token) return True else: - cached = self._cache.get(key) + cached = self._cache.get(key) if cached != NULL: - tokens.extend(i, cached, 0) + assert cached.is_lex == True + tokens.extend(i, cached.data.lexemes, cached.length) return True else: return False @@ -244,11 +242,14 @@ cdef class Language: for i in range(n): if tokens[i].lex.id == 1: return 0 - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) + cached = 
self.mem.alloc(1, sizeof(Cached)) + cached.length = n + cached.is_lex = True + lexemes = self.mem.alloc(n, sizeof(Lexeme**)) for i in range(n): lexemes[i] = tokens[i].lex - lexemes[i + 1] = NULL - self._cache.set(key, lexemes) + cached.data.lexemes = lexemes + self._cache.set(key, cached) cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1: cdef unicode string = chars[:length] @@ -287,10 +288,12 @@ cdef class Language: if lemma: tokens[i].lemma = self.lexicon.strings[lemma] set_morph_from_dict(&tokens[i].morph, props) - # Null-terminated array - tokens[i+1].lex = NULL + cached = self.mem.alloc(1, sizeof(Cached)) + cached.length = len(substrings) + cached.is_lex = False + cached.data.tokens = tokens slice_unicode(&string, chunk, 0, len(chunk)) - self._specials.set(string.key, tokens) + self._specials.set(string.key, cached) cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index f3d6011ec..01bec6815 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -30,6 +30,14 @@ cdef struct TokenC: int sense +ctypedef const Lexeme* const_Lexeme_ptr +ctypedef TokenC* TokenC_ptr + +ctypedef fused LexemeOrToken: + const_Lexeme_ptr + TokenC_ptr + + cdef class Tokens: cdef Pool mem cdef StringStore _string_store @@ -40,7 +48,7 @@ cdef class Tokens: cdef int max_length cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1 - cdef int push_back(self, int i, const Lexeme* lexeme) except -1 + cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[long, ndim=2] get_array(self, list features) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 004d0578c..4075e64d7 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -60,16 +60,16 @@ cdef class Tokens: def __len__(self): return self.length - cdef int push_back(self, int idx, const Lexeme* lexeme) except -1: + cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.data[self.length] - t.lex = lexeme - t.idx = idx - t.pos = 0 - t.sense = 0 + if LexemeOrToken is TokenC_ptr: + t[0] = lex_or_tok[0] + else: + t.lex = lex_or_tok self.length += 1 - return idx + lexeme.length + return idx + t.lex.length cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1: cdef int i From accdbe989b7110cea692ee4e5673b6739a65010a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 17:09:23 +1100 Subject: [PATCH 50/56] * Remove Tokens.extend method --- spacy/lang.pyx | 30 ++++++++++++++---------------- spacy/tokens.pxd | 1 - spacy/tokens.pyx | 14 -------------- 3 files changed, 14 insertions(+), 31 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 044bfb7bc..14a83522b 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -87,7 +87,7 @@ cdef class Language: cdef int idx = 0 for i, py_string in enumerate(strings): slice_unicode(&string_struct, py_string, 0, len(py_string)) - tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) + tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -136,23 +136,19 @@ cdef class Language: return tokens cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: - cdef int i - cdef TokenC* token cached = self._specials.get(key) - if cached != NULL: - assert not cached.is_lex - for i in range(cached.length): - 
token = &cached.data.tokens[i] - idx = tokens.push_back(idx, token) - return True - else: + if cached == NULL: cached = self._cache.get(key) - if cached != NULL: - assert cached.is_lex == True - tokens.extend(i, cached.data.lexemes, cached.length) - return True - else: + if cached == NULL: return False + cdef int i + if cached.is_lex: + for i in range(cached.length): + idx = tokens.push_back(idx, cached.data.lexemes[i]) + else: + for i in range(cached.length): + idx = tokens.push_back(idx, &cached.data.tokens[i]) + return True cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes @@ -215,8 +211,10 @@ cdef class Language: cdef const Lexeme* const* lexemes cdef Lexeme* lexeme cdef UniStr span + cdef int i if prefixes.size(): - idx = tokens.extend(idx, prefixes.data(), prefixes.size()) + for i in range(prefixes.size()): + idx = tokens.push_back(idx, prefixes[0][i]) if string.n != 0: cache_hit = self._try_cache(idx, string.key, tokens) if cache_hit: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 01bec6815..cc9e8a05d 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -47,7 +47,6 @@ cdef class Tokens: cdef int length cdef int max_length - cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1 cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1 diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 4075e64d7..0b94d81d4 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -71,20 +71,6 @@ cdef class Tokens: self.length += 1 return idx + t.lex.length - cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1: - cdef int i - if lexemes == NULL: - return idx - elif n == 0: - i = 0 - while lexemes[i] != NULL: - idx = self.push_back(idx, lexemes[i]) - i += 1 - else: - for i in range(n): - idx = self.push_back(idx, lexemes[i]) - return idx - cpdef int set_tag(self, int i, int tag_type, int tag) except -1: self.data[i].pos = tag From b962fe73d7401f9255d4bc72acab9aa3e49b586f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 19:04:27 +1100 Subject: [PATCH 51/56] * Make suffixes file use full-power regex, so that we can handle periods properly --- data/en/suffix | 24 +++++++++++++----------- spacy/util.py | 5 +---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/data/en/suffix b/data/en/suffix index a6ff32849..8ba48296d 100644 --- a/data/en/suffix +++ b/data/en/suffix @@ -1,13 +1,13 @@ , -" -) -] -} -* -! -? +\" +\) +\] +\} +\* +\! +\? % -$ +\$ > : ; @@ -16,6 +16,8 @@ $ '' 's 'S -.. -... -.... +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9])\. +(?<=[0-9])km diff --git a/spacy/util.py b/spacy/util.py index ff03760a5..1c25aeaf2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -31,10 +31,7 @@ def read_prefix(data_dir): def read_suffix(data_dir): with utf8open(path.join(data_dir, 'suffix')) as file_: entries = file_.read().split('\n') - expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()]) - # TODO: Fix this hack! 
- expression += r'|(?<=[a-z0-9])\.$' - expression += r'|(?<=[0-9])km$' + expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) return expression From 6b34a2f34ba4cc61654404f30b84d554fbc2ee10 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 9 Dec 2014 21:16:17 +1100 Subject: [PATCH 52/56] * Move morphological analysis into its own module, morphology.pyx --- spacy/en.pyx | 12 +++---- spacy/lang.pxd | 9 ++--- spacy/lang.pyx | 32 ++--------------- spacy/morphology.pxd | 42 +++++++++++++++++++++++ spacy/morphology.pyx | 81 ++++++++++++++++++++++++++++++++++++++++++++ spacy/tagger.pxd | 26 -------------- spacy/tagger.pyx | 30 ---------------- 7 files changed, 135 insertions(+), 97 deletions(-) create mode 100644 spacy/morphology.pxd create mode 100644 spacy/morphology.pyx diff --git a/spacy/en.pyx b/spacy/en.pyx index 0136818f2..9cd2546cb 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -35,8 +35,8 @@ from __future__ import unicode_literals cimport lang from .typedefs cimport flags_t import orth -from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB -from .tagger cimport X, PUNCT, EOL +from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from .morphology cimport X, PUNCT, EOL from .tokens cimport Morphology @@ -154,8 +154,8 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context) - _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) - t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) + if self.morphologizer: + self.morphologizer.set_morph(i, t) def train_pos(self, Tokens tokens, golds): cdef int i @@ -165,8 +165,8 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph) - t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex) + if self.morphologizer: + self.morphologizer.set_morph(i, t) c += t[i].pos == golds[i] return c diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 8a6aa5f97..20986f134 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -2,15 +2,15 @@ from libcpp.vector cimport vector from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER -from preshed.maps cimport PreshMap, PreshMapArray +from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger -from .tagger cimport univ_tag_t from .utf8string cimport StringStore, UniStr +from .morphology cimport Morphologizer cdef union LexemesOrTokens: @@ -40,17 +40,14 @@ cdef class Language: cdef readonly unicode name cdef PreshMap _cache cdef PreshMap _specials - cdef PreshMapArray _lemmas cpdef readonly Lexicon lexicon cpdef readonly Tagger pos_tagger - cpdef readonly object lemmatizer + cpdef readonly Morphologizer morphologizer cdef object _prefix_re cdef object _suffix_re cdef object _infix_re - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 - cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 14a83522b..6c018b2ce 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from cymem.cymem cimport Pool from murmurhash.mrmr 
cimport hash64 from preshed.maps cimport PreshMap -from .lemmatizer import Lemmatizer from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME @@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode from . import util from .util import read_lang_data from .tokens import Tokens - -from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS from .tokens cimport Morphology @@ -43,39 +40,16 @@ cdef class Language: self._infix_re = re.compile(infix) self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) - self._lemmas = PreshMapArray(N_UNIV_TAGS) self.pos_tagger = None - self.lemmatizer = None + self.morphologizer = None def load(self): - self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet')) self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) - - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: - if self.lemmatizer is None: - return lex.sic - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.sic - cdef int lemma = self._lemmas.get(pos, lex.sic) - if lemma != 0: - return lemma - cdef bytes py_string = self.lexicon.strings[lex.sic] - cdef set lemma_strings - cdef bytes lemma_string - if pos == NOUN: - lemma_strings = self.lemmatizer.noun(py_string) - elif pos == VERB: - lemma_strings = self.lemmatizer.verb(py_string) - else: - assert pos == ADJ - lemma_strings = self.lemmatizer.adj(py_string) - lemma_string = sorted(lemma_strings)[0] - lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i - self._lemmas.set(pos, lex.sic, lemma) - return lemma + self.morphologizer = Morphologizer(self.lexicon.strings, + path.join(util.DATA_DIR, self.name)) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd new file mode 100644 index 000000000..084cbbbe6 --- /dev/null +++ b/spacy/morphology.pxd @@ -0,0 +1,42 @@ +from .tokens cimport TokenC, Morphology +from .lexeme cimport Lexeme +from .utf8string cimport StringStore + +from preshed.maps cimport PreshMapArray +from cymem.cymem cimport Pool + +# Google universal tag set +cpdef enum univ_tag_t: + NO_TAG + ADJ + ADV + ADP + CONJ + DET + NOUN + NUM + PRON + PRT + VERB + X + PUNCT + EOL + N_UNIV_TAGS + + +cdef struct PosTag: + Morphology morph + int id + univ_tag_t pos + + +cdef class Morphologizer: + cdef Pool mem + cdef StringStore strings + cdef object lemmatizer + cdef PosTag* tags + + cdef PreshMapArray _morph + cdef PreshMapArray _lemmas + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 + cdef int set_morph(self, const int i, TokenC* tokens) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx new file mode 100644 index 000000000..63c5ff827 --- /dev/null +++ b/spacy/morphology.pyx @@ -0,0 +1,81 @@ +from os import path +import json + +from .lemmatizer import Lemmatizer + + +UNIV_TAGS = { + 'NULL': NO_TAG, + 'ADJ': ADJ, + 'ADV': ADV, + 'ADP': ADP, + 'CONJ': CONJ, + 'DET': DET, + 'NOUN': NOUN, + 'NUM': NUM, + 'PRON': PRON, + 'PRT': PRT, + 'VERB': VERB, + 'X': X, + '.': PUNCT, + 'EOL': EOL +} + + +cdef class Morphologizer: + """Given a POS tag and a Lexeme, find its lemma and morphological analysis. 
+ """ + def __init__(self, StringStore strings, data_dir): + self.mem = Pool() + self.strings = strings + cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) + tag_map = cfg['tag_map'] + tag_names = cfg['tag_names'] + self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) + self._lemmas = PreshMapArray(N_UNIV_TAGS) + self._morph = PreshMapArray(len(tag_names)) + self.tags = self.mem.alloc(len(tag_names), sizeof(PosTag)) + for i, tag in enumerate(tag_names): + pos, props = tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + self.tags[i].morph.number = props.get('number', 0) + self.tags[i].morph.tenspect = props.get('tenspect', 0) + self.tags[i].morph.mood = props.get('mood', 0) + self.tags[i].morph.gender = props.get('gender', 0) + self.tags[i].morph.person = props.get('person', 0) + self.tags[i].morph.case = props.get('case', 0) + self.tags[i].morph.misc = props.get('misc', 0) + + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: + if self.lemmatizer is None: + return lex.sic + if pos != NOUN and pos != VERB and pos != ADJ: + return lex.sic + cdef int lemma = self._lemmas.get(pos, lex.sic) + if lemma != 0: + return lemma + cdef bytes py_string = self.strings[lex.sic] + cdef set lemma_strings + cdef bytes lemma_string + if pos == NOUN: + lemma_strings = self.lemmatizer.noun(py_string) + elif pos == VERB: + lemma_strings = self.lemmatizer.verb(py_string) + else: + assert pos == ADJ + lemma_strings = self.lemmatizer.adj(py_string) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings.intern(lemma_string, len(lemma_string)).i + self._lemmas.set(pos, lex.sic, lemma) + return lemma + + cdef int set_morph(self, const int i, TokenC* tokens) except -1: + cdef const PosTag* tag = &self.tags[tokens[i].pos] + tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex) + morph = self._morph.get(tag.id, tokens[i].lemma) + if morph is NULL: + self._morph.set(tag.id, tokens[i].lemma, &tag.morph) + tokens[i].morph = tag.morph + else: + tokens[i].morph = morph[0] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 11880bf13..9abe25209 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -12,31 +12,6 @@ from .typedefs cimport hash_t from .tokens cimport Tokens, Morphology -# Google universal tag set -cdef enum univ_tag_t: - NO_TAG - ADJ - ADV - ADP - CONJ - DET - NOUN - NUM - PRON - PRT - VERB - X - PUNCT - EOL - N_UNIV_TAGS - - -cdef struct PosTag: - Morphology morph - int id - univ_tag_t pos - - cdef class Tagger: cdef class_t predict(self, const atom_t* context, object golds=*) except * @@ -45,5 +20,4 @@ cdef class Tagger: cpdef readonly LinearModel model cpdef readonly list tag_names - cdef PosTag* tags cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index db7974d91..a1e51c5b5 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -34,23 +34,10 @@ cdef class Tagger: self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] - tag_map = cfg['tag_map'] univ_counts = {} cdef unicode tag cdef unicode univ_tag self.tag_names = cfg['tag_names'] - self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) - for i, tag in enumerate(self.tag_names): - pos, props = tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - self.tags[i].morph.number = props.get('number', 0) - self.tags[i].morph.tenspect = props.get('tenspect', 0) - self.tags[i].morph.mood = props.get('mood', 0) - self.tags[i].morph.gender = props.get('gender', 0) - self.tags[i].morph.person = 
props.get('person', 0) - self.tags[i].morph.case = props.get('case', 0) - self.tags[i].morph.misc = props.get('misc', 0) self.tagdict = _make_tag_dict(cfg['tag_counts']) self.extractor = Extractor(templates) self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) @@ -85,23 +72,6 @@ cdef class Tagger: return tag_id -UNIV_TAGS = { - 'NULL': NO_TAG, - 'ADJ': ADJ, - 'ADV': ADV, - 'ADP': ADP, - 'CONJ': CONJ, - 'DET': DET, - 'NOUN': NOUN, - 'NUM': NUM, - 'PRON': PRON, - 'PRT': PRT, - 'VERB': VERB, - 'X': X, - '.': PUNCT, - 'EOL': EOL -} - def _make_tag_dict(counts): freq_thresh = 50 From 42973c4b370b1eb68eef45403356e267e25b5513 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 01:02:04 +1100 Subject: [PATCH 53/56] * Improve efficiency of tagger, and improve morphological processing --- spacy/en.pxd | 18 ------------ spacy/en.pyx | 44 +++++++++++++++------------- spacy/morphology.pxd | 4 +-- spacy/morphology.pyx | 70 +++++++++++++++++++++++++++++++++----------- spacy/orth.py | 1 + spacy/tagger.pxd | 2 +- spacy/tagger.pyx | 5 ++-- 7 files changed, 83 insertions(+), 61 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index cee754d9c..4ac8a126d 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -125,23 +125,5 @@ cpdef enum: N_CONTEXT_FIELDS -cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.sense - - cdef class English(Language): pass diff --git a/spacy/en.pyx b/spacy/en.pyx index 9cd2546cb..10773e0e2 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -151,10 +151,14 @@ cdef class English(Language): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context cdef TokenC* t = tokens.data + assert self.morphologizer is not None + cdef dict tagdict = self.pos_tagger.tagdict for i in range(tokens.length): - fill_pos_context(context, i, t) - t[i].pos = self.pos_tagger.predict(context) - if self.morphologizer: + if t[i].lex.sic in tagdict: + t[i].pos = tagdict[t[i].lex.sic] + else: + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) self.morphologizer.set_morph(i, t) def train_pos(self, Tokens tokens, golds): @@ -165,27 +169,27 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - if self.morphologizer: - self.morphologizer.set_morph(i, t) + self.morphologizer.set_morph(i, t) c += t[i].pos == golds[i] return c -cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1: - if tok_morph.number == 0: - tok_morph.number = pos_morph.number - if tok_morph.tenspect == 0: - tok_morph.tenspect = pos_morph.tenspect - if tok_morph.mood == 0: - tok_morph.mood = pos_morph.mood - if tok_morph.gender == 0: - tok_morph.gender = pos_morph.gender - if tok_morph.person == 0: - tok_morph.person = pos_morph.person - if tok_morph.case == 0: - tok_morph.case = pos_morph.case - if tok_morph.misc == 0: - tok_morph.misc = pos_morph.misc +cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) 
except -1: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense EN = English('en') diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 084cbbbe6..31cb08855 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,8 +35,8 @@ cdef class Morphologizer: cdef StringStore strings cdef object lemmatizer cdef PosTag* tags + cdef readonly list tag_names - cdef PreshMapArray _morph - cdef PreshMapArray _lemmas + cdef PreshMapArray _cache cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 cdef int set_morph(self, const int i, TokenC* tokens) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 63c5ff827..b21a3ced4 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,8 +1,10 @@ +# cython: profile=True +# cython: embedsignature=True from os import path import json from .lemmatizer import Lemmatizer - +from .typedefs cimport id_t UNIV_TAGS = { 'NULL': NO_TAG, @@ -22,6 +24,11 @@ UNIV_TAGS = { } +cdef struct _Cached: + Morphology morph + int lemma + + cdef class Morphologizer: """Given a POS tag and a Lexeme, find its lemma and morphological analysis. """ @@ -30,12 +37,11 @@ cdef class Morphologizer: self.strings = strings cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) tag_map = cfg['tag_map'] - tag_names = cfg['tag_names'] + self.tag_names = cfg['tag_names'] self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) - self._lemmas = PreshMapArray(N_UNIV_TAGS) - self._morph = PreshMapArray(len(tag_names)) - self.tags = self.mem.alloc(len(tag_names), sizeof(PosTag)) - for i, tag in enumerate(tag_names): + self._cache = PreshMapArray(len(self.tag_names)) + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): pos, props = tag_map[tag] self.tags[i].id = i self.tags[i].pos = pos @@ -46,15 +52,15 @@ cdef class Morphologizer: self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.misc = props.get('misc', 0) + if path.exists(path.join(data_dir, 'morph.json')): + with open(path.join(data_dir, 'morph.json')) as file_: + self.load_exceptions(json.loads(file_)) cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: return lex.sic if pos != NOUN and pos != VERB and pos != ADJ: return lex.sic - cdef int lemma = self._lemmas.get(pos, lex.sic) - if lemma != 0: - return lemma cdef bytes py_string = self.strings[lex.sic] cdef set lemma_strings cdef bytes lemma_string @@ -67,15 +73,45 @@ cdef class Morphologizer: lemma_strings = self.lemmatizer.adj(py_string) lemma_string = sorted(lemma_strings)[0] lemma = self.strings.intern(lemma_string, len(lemma_string)).i - self._lemmas.set(pos, lex.sic, lemma) return lemma cdef int set_morph(self, const int i, TokenC* tokens) except -1: cdef const PosTag* tag = &self.tags[tokens[i].pos] - tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex) - morph = self._morph.get(tag.id, tokens[i].lemma) - if morph is NULL: - self._morph.set(tag.id, tokens[i].lemma, 
&tag.morph) - tokens[i].morph = tag.morph - else: - tokens[i].morph = morph[0] + cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic) + if cached is NULL: + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) + cached.morph = tag.morph + self._cache.set(tag.id, tokens[i].lex.sic, cached) + + tokens[i].lemma = cached.lemma + tokens[i].morph = cached.morph + + def load_exceptions(self, dict exc): + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef id_t sic + cdef univ_tag_t pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + sic = self.strings[form_str] + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.strings[lemma_str] + set_morph_from_dict(&cached.morph, props) + self._cache.set(pos, sic, cached) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) diff --git a/spacy/orth.py b/spacy/orth.py index 0462d15df..2400b38a6 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import unicodedata from unidecode import unidecode +import re import math diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 9abe25209..a896742ad 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from preshed.maps cimport PreshMapArray -from .typedefs cimport hash_t +from .typedefs cimport hash_t, id_t from .tokens cimport Tokens, Morphology diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index a1e51c5b5..9890e95e1 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -72,10 +72,9 @@ cdef class Tagger: return tag_id - def _make_tag_dict(counts): - freq_thresh = 50 - ambiguity_thresh = 0.98 + freq_thresh = 20 + ambiguity_thresh = 0.97 tagdict = {} cdef atom_t word cdef atom_t tag From df3be149871da40b5f15c49d8870220f7fc36b5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 08:08:55 +1100 Subject: [PATCH 54/56] * Add pos_type features to POS tagger --- spacy/en.pxd | 15 ++++++++++----- spacy/en.pyx | 14 ++++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/spacy/en.pxd b/spacy/en.pxd index 4ac8a126d..2ca081e47 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -88,7 +88,8 @@ cpdef enum: P2_prefix P2_suffix P2_pos - P2_sense + P2_lemma + P2_pos_type P1_sic P1_cluster @@ -96,7 +97,8 @@ cpdef enum: P1_prefix P1_suffix P1_pos - P1_sense + P1_lemma + P1_pos_type W_sic W_cluster @@ -104,7 +106,8 @@ cpdef enum: W_prefix W_suffix W_pos - W_sense + W_lemma + W_pos_type N1_sic N1_cluster @@ -112,7 +115,8 @@ cpdef enum: N1_prefix N1_suffix N1_pos - N1_sense + N1_lemma + N1_pos_type N2_sic N2_cluster @@ -120,7 +124,8 @@ cpdef enum: N2_prefix N2_suffix N2_pos - N2_sense + N2_lemma + N2_pos_type N_CONTEXT_FIELDS diff --git a/spacy/en.pyx b/spacy/en.pyx index 10773e0e2..3ed0eaaa9 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -98,10 +98,10 @@ POS_TAGS = { POS_TEMPLATES = ( (W_sic,), - (P1_sic,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), (N1_sic,), (N2_sic,), - (P2_sic,), (W_suffix,), 
(W_prefix,), @@ -119,6 +119,11 @@ POS_TEMPLATES = ( (N2_cluster,), (P1_cluster,), (P2_cluster,), + + (W_pos_type,), + (N1_pos_type,), + (N1_pos_type,), + (P1_pos, W_pos_type, N1_pos_type), ) @@ -159,7 +164,7 @@ cdef class English(Language): else: fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context) - self.morphologizer.set_morph(i, t) + self.morphologizer.set_morph(i, t) def train_pos(self, Tokens tokens, golds): cdef int i @@ -189,7 +194,8 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: context[3] = t.lex.prefix context[4] = t.lex.suffix context[5] = t.pos - context[6] = t.sense + context[6] = t.lemma + context[7] = t.lex.pos_type EN = English('en') From 7831b066106620d3efa2acea3839daefd2035d98 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 08:09:13 +1100 Subject: [PATCH 55/56] * Compile morphology.pyx file --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 6ff1f5d62..827d44fc6 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ exts = [ Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++", + include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), From 9959a64f7bd36f6c62aa53f4bf8f30b0d4d81ee0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Dec 2014 08:09:32 +1100 Subject: [PATCH 56/56] * Working morphology and lemmatisation. POS tagging quite fast. --- spacy/lang.pyx | 12 ++++++------ spacy/lexeme.pxd | 1 + spacy/morphology.pxd | 5 ++++- spacy/morphology.pyx | 6 +++--- spacy/tagger.pxd | 2 +- spacy/tokens.pxd | 21 ++++++--------------- spacy/tokens.pyx | 28 ++++++++++++++++------------ spacy/typedefs.pxd | 11 +++++++++++ 8 files changed, 48 insertions(+), 38 deletions(-) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 6c018b2ce..4617c3853 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -53,7 +53,7 @@ cdef class Language: cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens cdef UniStr string_struct @@ -81,7 +81,7 @@ cdef class Language: tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. 
""" cdef int length = len(string) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens cdef int i = 0 @@ -110,11 +110,10 @@ cdef class Language: return tokens cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: - cached = self._specials.get(key) + #cached = self._specials.get(key) + cached = self._cache.get(key) if cached == NULL: - cached = self._cache.get(key) - if cached == NULL: - return False + return False cdef int i if cached.is_lex: for i in range(cached.length): @@ -266,6 +265,7 @@ cdef class Language: cached.data.tokens = tokens slice_unicode(&string, chunk, 0, len(chunk)) self._specials.set(string.key, cached) + self._cache.set(string.key, cached) cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f524188ed..a6f20906b 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -80,6 +80,7 @@ cpdef enum attr_id_t: LENGTH CLUSTER POS_TYPE + LEMMA cdef struct Lexeme: diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 31cb08855..9c5d342e9 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,10 +1,13 @@ -from .tokens cimport TokenC, Morphology + +from .tokens cimport TokenC from .lexeme cimport Lexeme from .utf8string cimport StringStore +from .typedefs cimport id_t, Morphology from preshed.maps cimport PreshMapArray from cymem.cymem cimport Pool + # Google universal tag set cpdef enum univ_tag_t: NO_TAG diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index b21a3ced4..346c778a9 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -52,9 +52,9 @@ cdef class Morphologizer: self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.misc = props.get('misc', 0) - if path.exists(path.join(data_dir, 'morph.json')): - with open(path.join(data_dir, 'morph.json')) as file_: - self.load_exceptions(json.loads(file_)) + if path.exists(path.join(data_dir, 'morphs.json')): + with open(path.join(data_dir, 'morphs.json')) as file_: + self.load_exceptions(json.load(file_)) cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index a896742ad..33732f987 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -9,7 +9,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from preshed.maps cimport PreshMapArray from .typedefs cimport hash_t, id_t -from .tokens cimport Tokens, Morphology +from .tokens cimport Tokens cdef class Tagger: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index cc9e8a05d..43aa7b442 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -7,19 +7,10 @@ from thinc.typedefs cimport atom_t from .lexeme cimport Lexeme from .typedefs cimport flags_t -from .utf8string cimport StringStore -from libc.stdint cimport uint8_t, uint16_t +from .typedefs cimport Morphology +from .lang cimport Language -cdef struct Morphology: - uint8_t number - uint8_t tenspect # Tense/aspect/voice - uint8_t mood - uint8_t gender - uint8_t person - uint8_t case - uint8_t misc - cdef struct TokenC: const Lexeme* lex @@ -40,7 +31,8 @@ ctypedef fused LexemeOrToken: cdef class Tokens: cdef Pool mem - cdef StringStore _string_store + cdef Language lang + cdef list tag_names cdef TokenC* data @@ -48,16 +40,15 @@ cdef class Tokens: cdef int max_length cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 - cpdef int 
set_tag(self, int i, int tag_type, int tag) except -1 cpdef np.ndarray[long, ndim=2] get_array(self, list features) cdef class Token: - cdef StringStore _string_store + cdef public Language lang cdef public int i cdef public int idx - cdef public int pos + cdef int pos cdef int lemma cdef public atom_t id diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 0b94d81d4..617feb269 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -30,8 +30,8 @@ cdef class Tokens: >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') """ - def __init__(self, StringStore string_store, string_length=0): - self._string_store = string_store + def __init__(self, Language lang, string_length=0): + self.lang = lang if string_length >= 3: size = int(string_length / 3.0) else: @@ -50,7 +50,7 @@ cdef class Tokens: def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self._string_store, i, self.data[i].idx, self.data[i].pos, + return Token(self.lang, i, self.data[i].idx, self.data[i].pos, self.data[i].lemma, self.data[i].lex[0]) def __iter__(self): @@ -71,9 +71,6 @@ cdef class Tokens: self.length += 1 return idx + t.lex.length - cpdef int set_tag(self, int i, int tag_type, int tag) except -1: - self.data[i].pos = tag - @cython.boundscheck(False) cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): cdef int i, j @@ -92,7 +89,10 @@ cdef class Tokens: cdef PreshCounter counts = PreshCounter(2 ** 8) for i in range(self.length): - attr = get_attr(self.data[i].lex, attr_id) + if attr_id == LEMMA: + attr = self.data[i].lemma + else: + attr = get_attr(self.data[i].lex, attr_id) counts.inc(attr, 1) return dict(counts) @@ -114,9 +114,9 @@ cdef class Tokens: @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma, - dict lex): - self._string_store = string_store + def __init__(self, Language lang, int i, int idx, + int pos, int lemma, dict lex): + self.lang = lang self.idx = idx self.pos = pos self.i = i @@ -141,12 +141,16 @@ cdef class Token: def __get__(self): if self.sic == 0: return '' - cdef bytes utf8string = self._string_store[self.sic] + cdef bytes utf8string = self.lang.lexicon.strings[self.sic] return utf8string.decode('utf8') property lemma: def __get__(self): if self.lemma == 0: return self.string - cdef bytes utf8string = self._string_store[self.lemma] + cdef bytes utf8string = self.lang.lexicon.strings[self.lemma] return utf8string.decode('utf8') + + property pos: + def __get__(self): + return self.lang.pos_tagger.tag_names[self.pos] diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 893865133..02d327b72 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,4 +1,5 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t +from libc.stdint cimport uint8_t ctypedef uint64_t hash_t ctypedef char* utf8_t @@ -7,3 +8,13 @@ ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t ctypedef uint16_t tag_t + + +cdef struct Morphology: + uint8_t number + uint8_t tenspect # Tense/aspect/voice + uint8_t mood + uint8_t gender + uint8_t person + uint8_t case + uint8_t misc
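
A closing note on the tagger changes in PATCH 53: much of the speed-up comes from the tag dictionary. In the English tagging loop, tokens whose `sic` id is in `pos_tagger.tagdict` take their tag straight from the dict and skip both feature extraction and the linear model. The body of `_make_tag_dict` is only partly visible above (essentially the tightened `freq_thresh`/`ambiguity_thresh` values), so the plain-Python sketch below of such a thresholded dictionary is an approximation rather than the exact implementation:

    # Sketch of a frequency/ambiguity-thresholded tag dictionary, in the spirit
    # of _make_tag_dict. `counts` maps word -> {tag: count}; the thresholds
    # follow the values in the diff (freq_thresh=20, ambiguity_thresh=0.97).
    def make_tag_dict(counts, freq_thresh=20, ambiguity_thresh=0.97):
        tagdict = {}
        for word, tag_counts in counts.items():
            total = sum(tag_counts.values())
            if total < freq_thresh:
                continue
            best_tag, best_count = max(tag_counts.items(), key=lambda kv: kv[1])
            if float(best_count) / total >= ambiguity_thresh:
                tagdict[word] = best_tag   # unambiguous enough: tag without the model
        return tagdict

    print(make_tag_dict({'the': {'DT': 1000, 'NN': 2}, 'run': {'VB': 60, 'NN': 55}}))
    # {'the': 'DT'} -- 'run' is too ambiguous to shortcut
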