spaCy/spacy/spacy.pyx

# cython: profile=True
# cython: embedsignature=True
"""Common classes and utilities across languages.

Provides the main implementation of the spaCy tokenizer. Specific languages
subclass the Language class, overriding the tokenization rules as necessary.
Special-case tokenization rules are read from data/<lang>/tokenization.
"""

 
from __future__ import unicode_literals

from libc.stdlib cimport calloc, free
from libcpp.pair cimport pair
from cython.operator cimport dereference as deref

from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport LexID

from . import util
from os import path

# Module-level lookup tables consulted by Language.load_dist_info: each tag /
# flag name found in the distributional data is looked up here and the value
# is OR-ed into the lexeme's possible_tags / dist_flags field.
# NOTE(review): both start empty in this module — presumably populated by the
# language-specific modules before any dist info is loaded; confirm.
TAGS = {}
DIST_FLAGS = {}

cdef class Language:
    """Language-independent tokenizer and vocabulary.

    State set up in __cinit__:
        name: The language name passed to the constructor.
        bacov: Reverse index mapping a lexeme's hash back to its string.
        chunks: Hash map from the hash of a whitespace-delimited chunk to the
            address of a NULL-terminated array of Lexeme pointers.
        vocab: Hash map from the hash of a word to the address of its Lexeme.

    Subclasses customise tokenization by overriding find_split (or
    find_substrings) and set_orth.
    """
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.chunks = dense_hash_map[StringHash, size_t]()
        self.vocab = dense_hash_map[StringHash, size_t]()
        # dense_hash_map requires an empty key to be set before first use;
        # 0 is reserved for that purpose in both tables.
        self.chunks.set_empty_key(0)
        self.vocab.set_empty_key(0)
        # Special cases are loaded first so they take priority over chunks
        # that would otherwise be split by find_substrings.
        self.load_tokenization(util.read_tokenization(name))
        self.load_dist_info(util.read_dist_info(name))

    cpdef Tokens tokenize(self, unicode string):
        """Tokenize.

        Split the string into whitespace-delimited chunks (space, newline and
        tab are the delimiters), then expand each chunk into its lexemes via
        the chunk cache. Runs of whitespace do not produce empty chunks.

        Args:
            string (unicode): The string to split.

        Returns:
            tokens (Tokens): A Tokens object.
        """
        cdef Lexeme** chunk
        cdef Tokens tokens = Tokens(self)
        cdef size_t start = 0
        cdef size_t i = 0
        for c in string:
            if _is_whitespace(c):
                # Only emit a chunk when there is text since the last
                # delimiter; this skips consecutive whitespace.
                if start < i:
                    chunk = self.lookup_chunk(string[start:i])
                    _extend(tokens, chunk)
                start = i + 1
            i += 1
        # Flush the trailing chunk, if the string did not end in whitespace.
        if start < i:
            chunk = self.lookup_chunk(string[start:])
            _extend(tokens, chunk)
        return tokens

    cdef Lexeme* lookup(self, unicode string) except NULL:
        """Return the Lexeme for a non-empty word, creating it on first use."""
        assert len(string) != 0
        cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]
        if word == NULL:
            word = self.new_lexeme(string)
        return word

    cdef Lexeme** lookup_chunk(self, unicode string) except NULL:
        """Return the NULL-terminated lexeme array for a chunk.

        On a cache miss the chunk is split with find_substrings and the
        result is stored in the chunk cache.
        """
        cdef StringHash h = hash(string)
        cdef Lexeme** chunk = <Lexeme**>self.chunks[h]
        if chunk == NULL:
            chunk = self.new_chunk(string, self.find_substrings(string))
        return chunk

    cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:
        """Build and cache the lexeme array for a chunk.

        Args:
            string (unicode): The chunk text, used as the cache key.
            substrings (list): The strings the chunk is split into.

        Returns:
            A calloc'd array of len(substrings) Lexeme pointers followed by a
            NULL terminator. The array is stored in self.chunks.

        Raises:
            MemoryError: If the array cannot be allocated.
        """
        cdef size_t n = len(substrings)
        cdef size_t i
        cdef Lexeme** chunk = <Lexeme**>calloc(n + 1, sizeof(Lexeme*))
        if chunk == NULL:
            raise MemoryError()
        try:
            for i in range(n):
                chunk[i] = self.lookup(substrings[i])
        except BaseException:
            # Do not leak the half-filled array if a lookup fails.
            free(chunk)
            raise
        # Terminate by length rather than by the loop variable, which is
        # unbound when substrings is empty.
        chunk[n] = NULL
        self.chunks[hash(string)] = <size_t>chunk
        return chunk

    cdef Lexeme* new_lexeme(self, unicode string) except NULL:
        """Allocate a Lexeme for a word and register it in the vocabulary.

        The UTF-8 encoding of the word is copied into memory owned by the
        lexeme, set_orth is given the chance to fill in further fields, and
        the lexeme is indexed by hash in both self.vocab and self.bacov.

        Raises:
            MemoryError: If either allocation fails.
        """
        cdef bytes byte_string = string.encode('utf8')
        cdef size_t n_bytes = len(byte_string)
        cdef char* src = <char*>byte_string
        cdef char* buf
        cdef size_t j
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
        if word == NULL:
            raise MemoryError()
        # byte_string is a local object that is released when this function
        # returns; pointing word.string at its buffer would leave a dangling
        # pointer. Copy the bytes into a buffer that lives as long as the
        # lexeme (calloc zero-fills, so the copy is NUL-terminated).
        buf = <char*>calloc(n_bytes + 1, sizeof(char))
        if buf == NULL:
            free(word)
            raise MemoryError()
        for j in range(n_bytes):
            buf[j] = src[j]
        word.string = buf
        word.length = n_bytes
        self.set_orth(string, word)

        word.lex = hash(string)
        self.bacov[word.lex] = string
        self.vocab[word.lex] = <LexID>word
        return word

    cpdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
        return self.bacov[hash_value]

    cpdef list find_substrings(self, unicode chunk):
        """Find how to split a chunk into substrings.

        This method calls find_split repeatedly. Most languages will want to
        override find_split, but it may be useful to override this instead.

        Args:
            chunk (unicode): The string to be split, e.g. u"Mike's!"

        Returns:
            substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].
        """
        substrings = []
        while chunk:
            split = self.find_split(chunk)
            if split == 0:
                # A split point of 0 means "no further split": keep the rest
                # whole, which also prevents an infinite loop.
                substrings.append(chunk)
                break
            substrings.append(chunk[:split])
            chunk = chunk[split:]
        return substrings

    cdef int find_split(self, unicode word):
        """Return the index at which to split off the first substring.

        The base implementation returns len(word), i.e. never splits.
        """
        return len(word)

    cdef int set_orth(self, unicode string, Lexeme* word):
        """Hook called for every new lexeme; the base class does nothing."""
        pass

    def load_tokenization(self, token_rules):
        '''Load special-case tokenization rules.

        Loads special-case tokenization rules into the Language.chunk cache,
        read from data/<lang>/tokenization . The special cases are loaded before
        any language data is tokenized, giving these priority.  For instance,
        the English tokenization rules map "ain't" to ["are", "not"].

        Args:
            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                a string and tokens is a list of strings.
        '''
        for chunk, tokens in token_rules:
            self.new_chunk(chunk, tokens)

    def load_dist_info(self, dist_info):
        '''Load distributional information for the known lexemes of the language.

        The distributional information is read from data/<lang>/dist_info.json .
        It contains information like the (smoothed) unigram log probability of
        the word, how often the word is found upper-cased, how often the word
        is found title-cased, etc.

        Args:
            dist_info (dict): Maps each word to a dict with the keys 'prob',
                'cluster', 'flags' and 'tagdict'. Every flag must be a key of
                DIST_FLAGS and every tag a key of TAGS.
        '''
        cdef unicode string
        cdef dict word_dist
        cdef Lexeme* w
        for string, word_dist in dist_info.items():
            w = self.lookup(string)
            # word_dist is a plain dict, so its entries must be read by key;
            # attribute access on a dict raises AttributeError.
            w.prob = word_dist['prob']
            w.cluster = word_dist['cluster']
            for flag in word_dist['flags']:
                w.dist_flags |= DIST_FLAGS[flag]
            for tag in word_dist['tagdict']:
                w.possible_tags |= TAGS[tag]


cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
    # True exactly for the three delimiters the tokenizer splits on:
    # space, newline and tab. Every other character is treated as text.
    return c == ' ' or c == '\n' or c == '\t'


cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:
    # Append every lexeme of a NULL-terminated chunk array to the token
    # vector, bumping tokens.length once per lexeme appended.
    cdef Lexeme** cursor = chunk
    while cursor[0] != NULL:
        tokens.vctr[0].push_back(<Lexeme_addr>cursor[0])
        tokens.length += 1
        cursor += 1
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 09:36:43 +04:00			`# cython: profile=True`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`# cython: embedsignature=True`
			`"""Common classes and utilities across languages.`

			`Provides the main implementation for the spacy tokenizer. Specific languages`
			`subclass the Language class, over-writing the tokenization rules as necessary.`
			`Special-case tokenization rules are read from data/<lang>/tokenization .`
			`"""`


* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc. 2014-07-05 22:51:42 +04:00			`from __future__ import unicode_literals`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00
* Switch to dynamically allocating array, based on the document length 2014-07-07 10:05:29 +04:00			`from libc.stdlib cimport calloc, free`
* Fix memory leak in tokenizer, caused by having a fixed vocab. 2014-07-31 21:19:38 +04:00			`from libcpp.pair cimport pair`
			`from cython.operator cimport dereference as deref`
* Switch to dynamically allocating array, based on the document length 2014-07-07 10:05:29 +04:00
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00			`from spacy.lexeme cimport Lexeme`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`from spacy.lexeme cimport LexID`
* Reorganized, moving language-independent stuff to spacy. The functions in spacy ask for the dictionaries and split function on input, but the language-specific modules are curried versions that use the globals 2014-07-07 06:21:06 +04:00
			`from . import util`
* Fixed major efficiency problem, from not quite grokking pass by reference in cython c++ 2014-07-07 09:36:43 +04:00			`from os import path`
* Progress to getting WordTree working. Tests pass, but so far it's slower. 2014-08-16 21:59:38 +04:00
* Working refactor, with updated data model for Lexemes 2014-08-19 06:21:20 +04:00			`TAGS = {}`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`DIST_FLAGS = {}`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`cdef class Language:`
			`def __cinit__(self, name):`
			`self.name = name`
			`self.bacov = {}`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`self.chunks = dense_hash_map[StringHash, size_t]()`
			`self.vocab = dense_hash_map[StringHash, size_t]()`
			`self.chunks.set_empty_key(0)`
			`self.vocab.set_empty_key(0)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`self.load_tokenization(util.read_tokenization(name))`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`self.load_dist_info(util.read_dist_info(name))`

			`cpdef Tokens tokenize(self, unicode string):`
			`"""Tokenize.`

			`Split the string into tokens.`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`Args:`
			`string (unicode): The string to split.`

			`Returns:`
			`tokens (Tokens): A Tokens object.`
			`"""`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cdef Lexeme** chunk`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`cdef Tokens tokens = Tokens(self)`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`cdef size_t length = len(string)`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef size_t start = 0`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`cdef size_t i = 0`
			`for c in string:`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`if _is_whitespace(c):`
			`if start < i:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`chunk = self.lookup_chunk(string[start:i])`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`_extend(tokens, chunk)`
			`start = i + 1`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`i += 1`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`if start < i:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`chunk = self.lookup_chunk(string[start:])`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`_extend(tokens, chunk)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00			`return tokens`

* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cdef Lexeme* lookup(self, unicode string) except NULL:`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`assert len(string) != 0`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cdef Lexeme* word = <Lexeme*>self.vocab[hash(string)]`
			`if word == NULL:`
			`word = self.new_lexeme(string)`
			`return word`
* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`cdef Lexeme** lookup_chunk(self, unicode string) except NULL:`
			`cdef StringHash h = hash(string)`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef Lexeme chunk = <Lexeme>self.chunks[h]`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`cdef int split`
			`if chunk == NULL:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`chunk = self.new_chunk(string, self.find_substrings(string))`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`return chunk`

			`cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL:`
			`cdef Lexeme chunk = <Lexeme>calloc(len(substrings) + 1, sizeof(Lexeme*))`
			`for i, substring in enumerate(substrings):`
			`chunk[i] = self.lookup(substring)`
			`chunk[i + 1] = NULL`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`self.chunks[hash(string)] = <size_t>chunk`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`return chunk`

			`cdef Lexeme* new_lexeme(self, unicode string) except NULL:`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00			`cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))`
* Working refactor, with updated data model for Lexemes 2014-08-19 06:21:20 +04:00			`cdef bytes byte_string = string.encode('utf8')`
			`word.string = <char*>byte_string`
			`word.length = len(byte_string)`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`self.set_orth(string, word)`

* Remove dependence on murmurhash 2014-08-16 19:37:09 +04:00			`word.lex = hash(string)`
* Restore string saving to spacy 2014-08-16 18:09:24 +04:00			`self.bacov[word.lex] = string`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`self.vocab[word.lex] = <LexID>word`
* WordTree in use. Need to reform the way chunks are handled. Should be properly one Lexeme per word, with split points being the things that are cached. 2014-08-16 22:10:22 +04:00			`return word`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`cpdef unicode unhash(self, StringHash hash_value):`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00			`'''Fetch a string from the reverse index, given its hash value.'''`
* Restore unicode, work on improving string storage. 2014-08-16 16:35:34 +04:00			`return self.bacov[hash_value]`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`cpdef list find_substrings(self, unicode chunk):`
			`"""Find how to split a chunk into substrings.`

			`This method calls find_split repeatedly. Most languages will want to`
			`override find_split, but it may be useful to override this instead.`

			`Args:`
			`chunk (unicode): The string to be split, e.g. u"Mike's!"`

			`Returns:`
			`substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"].`
			`"""`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`substrings = []`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`while chunk:`
			`split = self.find_split(chunk)`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`if split == 0:`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`substrings.append(chunk)`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`break`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`substrings.append(chunk[:split])`
			`chunk = chunk[split:]`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`return substrings`

			`cdef int find_split(self, unicode word):`
			`return len(word)`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`cdef int set_orth(self, unicode string, Lexeme* word):`
			`pass`

			`def load_tokenization(self, token_rules):`
			`'''Load special-case tokenization rules.`

			`Loads special-case tokenization rules into the Language.chunk cache,`
			`read from data/<lang>/tokenization . The special cases are loaded before`
			`any language data is tokenized, giving these priority. For instance,`
			`the English tokenization rules map "ain't" to ["are", "not"].`

			`Args:`
			`token_rules (list): A list of (chunk, tokens) pairs, where chunk is`
			`a string and tokens is a list of strings.`
			`'''`
* Refactor spacy so that chunks return arrays of lexemes, so that there is properly one lexeme per word. 2014-08-18 21:14:00 +04:00			`for chunk, tokens in token_rules:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`self.new_chunk(chunk, tokens)`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00
* Reforming data model for lexemes 2014-08-19 04:40:37 +04:00			`def load_dist_info(self, dist_info):`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`'''Load distributional information for the known lexemes of the language.`

			`The distributional information is read from data/<lang>/dist_info.json .`
			`It contains information like the (smoothed) unigram log probability of`
			`the word, how often the word is found upper-cased, how often the word`
			`is found title-cased, etc.`
			`'''`
* Reforming data model for lexemes 2014-08-19 04:40:37 +04:00			`cdef unicode string`
			`cdef dict word_dist`
* Refactoring tokenizer 2014-08-16 05:22:03 +04:00			`cdef Lexeme* w`
* Reforming data model for lexemes 2014-08-19 04:40:37 +04:00			`for string, word_dist in dist_info.items():`
			`w = self.lookup(string)`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`w.prob = word_dist.prob`
			`w.cluster = word_dist.cluster`
* Reforming data model for lexemes 2014-08-19 04:40:37 +04:00			`for flag in word_dist.flags:`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`w.dist_flags \|= DIST_FLAGS[flag]`
* Reforming data model for lexemes 2014-08-19 04:40:37 +04:00			`for tag in word_dist.tagdict:`
* Broken version being refactored for docs 2014-08-20 15:39:39 +04:00			`w.possible_tags \|= TAGS[tag]`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00

* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`cdef inline bint _is_whitespace(Py_UNICODE c) nogil:`
			`if c == ' ':`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`return True`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`elif c == '\n':`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`return True`
* Working version that uses arrays for chunks, which should be more memory efficient 2014-08-18 22:23:54 +04:00			`elif c == '\t':`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`return True`
			`else:`
			`return False`


* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`cdef inline int _extend(Tokens tokens, Lexeme** chunk) nogil:`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`cdef size_t i = 0`
			`while chunk[i] != NULL:`
* Roll back to using unicode, and never Py_UNICODE. No dependence on murmurhash either. 2014-08-18 22:48:48 +04:00			`tokens.vctr[0].push_back(<Lexeme_addr>chunk[i])`
			`tokens.length += 1`
* Working version, adding improvements 2014-08-18 21:59:59 +04:00			`i += 1`