diff --git a/data/en/prefix b/data/en/prefix index 64a3f1f2f..cb9bb4d7b 100644 --- a/data/en/prefix +++ b/data/en/prefix @@ -11,3 +11,8 @@ $ ' `` ` +# +US$ +C$ +A$ +a- diff --git a/data/en/suffix b/data/en/suffix index 77400d0fd..8ba48296d 100644 --- a/data/en/suffix +++ b/data/en/suffix @@ -1,13 +1,13 @@ , -" -) -] -} -* -! -? +\" +\) +\] +\} +\* +\! +\? % -$ +\$ > : ; @@ -16,7 +16,8 @@ $ '' 's 'S -. -.. -... -.... +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9])\. +(?<=[0-9])km diff --git a/data/en/tokenization b/data/en/tokenization index 6bf0d738b..382b7e383 100644 --- a/data/en/tokenization +++ b/data/en/tokenization @@ -4,101 +4,9 @@ #*---* --- #*'s 's -'s 's -'S 'S -ain't are not -aren't are not -can't can not -cannot can not -could've could have -couldn't could not -couldn't've could not have -didn't did not -doesn't does not -don't do not -hadn't had not -hadn't've had not have -hasn't has not -haven't have not -he'd he would -he'd've he would have -he'll he will -he's he 's -how'd he would -how'll he will -how's how 's -I'd I would -I'd've I would have -I'll I will -I'm I am -I'ma I will -I've I have -isn't is not -it'd it would -it'd've it would have -it'll it will -it's it 's -let's let 's -mightn't might not -mightn't've might not have -might've might have -mustn't must not -must've must have -needn't need not -not've not have -shan't shall not -she'd she would -she'd've she would have -she'll she will -she's she 's -should've should have -shouldn't should not -shouldn't've should not have -that's that 's -there'd there would -there'd've there would have -there's there is -they'd there would -they'd've they would have -they'll they will -they're they are -they've they have -wasn't was not -we'd we would -we'd've we would have -we'll we will -we're we are -we've we have -weren't were not -what'll what will -what're what are -what's what 's -what've what have -when's when 's -where'd where would -where's where 's -where've where have -who'd who would -who'll who will -who're who are -who's who 's -who've who have -why'll who will -why're why are -why's why 's -won't will not -would've would have -wouldn't would not -wouldn't've would not have -you'd you would -you'd've you would have -you'll you will -you're you are -you've you have -'em them -'ol old 10km 10 km U.S. U.S. +U.K. U.K. non-U.S. non-U.S. U.N. U.N. Co. Co. @@ -115,7 +23,12 @@ A.G. A.G. Rep. Rep. Ms. Ms. Mr. Mr. +Mrs. Mrs. a.m. a.m. +Sen. Sen. +INC. INC. +CO. CO. +COS. COS. p.m. p.m. Nos. Nos. a.k.a. a.k.a. @@ -127,6 +40,7 @@ E. E. F. F. G. G. H. H. +I. I. J. J. K. K. L. L. @@ -205,6 +119,9 @@ Wash. Wash. W.Va. W.Va. Wis. Wis. Wyo. Wyo. +L.A. L.A. +R.H. R.H. +Gov. Gov. '' '' :) :) <3 <3 @@ -262,3 +179,19 @@ V_V V_V o.O o.O ") ") .... .... +a- a - +Messrs. Messrs. +No. No. +vs. vs. +Gen. Gen. +Cos. Cos. +L.J. L.J. +D.T. D.T. +Prof. Prof. +Bros. Bros. +J.C. J.C. +Neb. Neb. +Adm. Adm. +U.S.S.R. U.S.S.R. +Rev. Rev. +H.F. H.F. diff --git a/docs/source/index.rst b/docs/source/index.rst index 97681bfd8..fb738aa32 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,45 +3,228 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. +================================ spaCy NLP Tokenizer and Lexicon ================================ -spaCy is a library for industrial strength NLP in Python. Its core -values are: +spaCy is a library for industrial-strength NLP in Python and Cython. 
spaCy's +take on NLP is that it's mostly about feature extraction --- that's the part +that's specific to NLP, so that's what an NLP library should focus on. -* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x - faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is - over 100x faster than Stanford's. +spaCy also believes that for NLP, **efficiency is critical**. If you're +running batch jobs, you probably have an enormous amount of data; if you're +serving requests one-by-one, you want lower latency and fewer servers. Even if +you're doing exploratory research on relatively small samples, you should still +value efficiency, because it means you can run more experiments. -* **Accuracy**: All spaCy tools are within 0.5% of the current published - state-of-the-art, on both news and web text. NLP moves fast, so always check - the numbers --- and don't settle for tools that aren't backed by - rigorous recent evaluation. +Depending on the task, spaCy is between 10 and 200 times faster than NLTK, +often with much better accuracy. See Benchmarks for details, and +Why is spaCy so fast? for a discussion of the algorithms and implementation +that makes this possible. -* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You - get 1 --- the best one --- with a simple, low-level interface. This keeps the - code-base small and concrete. Our Python APIs use lists and - dictionaries, and our C/Cython APIs use arrays and simple structs. ++---------+----------+-------------+----------+ +| System | Tokenize | --> Counts | --> Stem | ++---------+----------+-------------+----------+ +| spaCy | 1m42s | 1m59s | 1m59s | ++---------+----------+-------------+----------+ +| NLTK | 20m2s | 28m24s | 52m28 | ++---------+----------+-------------+----------+ + +Times for 100m words of text. + + +Unique Lexicon-centric design +============================= + +spaCy helps you build models that generalise better, by making it easy to use +more robust features. Instead of a list of strings, the tokenizer returns +references to rich lexical types. Features which ask about the word's Brown cluster, +its typical part-of-speech tag, how it's usually cased etc require no extra effort: + + >>> from spacy.en import EN + >>> from spacy.feature_names import * + >>> feats = ( + SIC, # ID of the original word form + STEM, # ID of the stemmed word form + CLUSTER, # ID of the word's Brown cluster + IS_TITLE, # Was the word title-cased? + POS_TYPE # A cluster ID describing what POS tags the word is usually assigned + ) + >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^') + >>> tokens.to_array(feats)[:5] + array([[ 1, 2, 3, 4], + [...], + [...], + [...]]) + + +spaCy is designed to **make the right thing easy**, where the right thing is to: + +* **Use rich distributional and orthographic features**. Without these, your model + will be very brittle and domain dependent. + +* **Compute features per type, not per token**. Because of Zipf's law, you can + expect this to be exponentially more efficient. + +* **Minimize string processing**, and instead compute with arrays of ID ints. +For the current list of lexical features, see `Lexical Features`_. -Comparison ----------- +.. _lexical features: features.html -+----------------+-------------+--------+---------------+--------------+ -| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. 
(web) |
-+----------------+-------------+--------+---------------+--------------+
-| spaCy | 107,000 | 1.3gb | 96.7 | |
-+----------------+-------------+--------+---------------+--------------+
-| Stanford | 8,000 | 1.5gb | 96.7 | |
-+----------------+-------------+--------+---------------+--------------+
-| NLTK | 543 | 61mb | 94.0 | |
-+----------------+-------------+--------+---------------+--------------+
+Tokenization done right
+=======================
+
+Most tokenizers rely on complicated regular expressions. Often, they leave you
+with no way to align the tokens back to the original string --- a vital feature
+if you want to display some mark-up, such as spelling correction. The regular
+expressions also interact, making it hard to accommodate special cases.
+
+spaCy introduces a **novel tokenization algorithm** that's much faster and much
+more flexible:
+
+.. code-block:: python
+
+    def tokenize(string, prefixes={}, suffixes={}, specials={}):
+        '''Sketch of spaCy's tokenization algorithm.'''
+        tokens = []
+        cache = {}
+        for chunk in string.split():
+            # Because of Zipf's law, the cache serves the majority of "chunks".
+            if chunk in cache:
+                tokens.extend(cache[chunk])
+                continue
+            key = chunk
+
+            subtokens = []
+            # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . :
+            # If we split one off, check whether we're left with a special-case,
+            # e.g. contractions (can't, won't, etc), emoticons, abbreviations, etc.
+            # This makes the tokenization easy to update and customize.
+            while chunk:
+                prefix, chunk = _consume_prefix(chunk, prefixes)
+                if prefix:
+                    subtokens.append(prefix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+                suffix, chunk = _consume_suffix(chunk, suffixes)
+                if suffix:
+                    subtokens.append(suffix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+            cache[key] = subtokens
+
+Your data is going to have its own quirks, so it's really useful to have
+a tokenizer you can easily control. To see the limitations of the standard
+regex-based approach, check out `CMU's recent work on tokenizing tweets `_.
+Despite a lot of careful attention, they can't handle all of their
+known emoticons correctly --- doing so would interfere with the way they
+process other punctuation. This isn't a problem for spaCy: we just add them
+all to the special tokenization rules.
+
+spaCy's tokenizer is also incredibly efficient: it can create an inverted index
+of the 1.8 billion word Gigaword corpus in under half an hour --- on a MacBook
+Air. See the `inverted index tutorial`_.
+
+.. _inverted index tutorial: index_tutorial.html
+
+Comparison with NLTK
+====================
+
+`NLTK `_ provides interfaces to a wide variety of NLP
+tools and resources, and its own implementations of a few algorithms. It comes
+with comprehensive documentation, and a book introducing concepts in NLP. For
+these reasons, it's very widely known. However, if you're trying to make money
+or do cutting-edge research, NLTK is not a good choice.
+
+The `list of stuff in NLTK `_ looks impressive,
+but almost none of it is useful for real work. You're not going to make any money,
+or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation,
+etc. Most of NLTK is there to assist in the explanation of ideas in computational
+linguistics, at roughly an undergraduate level.
+But it also claims to support serious work, by wrapping external tools.
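+
+For a concrete picture of what "wrapping external tools" means in practice,
+here is a rough, hypothetical sketch of calling the Stanford tagger through
+NLTK's wrapper. It is not code from this repository: the exact class name and
+constructor arguments differ between NLTK versions, and the model and jar
+paths are placeholders.
+
+.. code-block:: python
+
+    from nltk.tag.stanford import POSTagger
+
+    # Looks like an ordinary, native Python API...
+    tagger = POSTagger('models/english-bidirectional-distsim.tagger',
+                       'stanford-postagger.jar')
+    tags = tagger.tag('This is a single document .'.split())
+    # ...but each call communicates with a Java subprocess and reloads the
+    # model, which is where the abstraction leaks (see the case study below).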
+ +In a pretty well known essay, Joel Spolsky discusses the pain of dealing with +`leaky abstractions `_. +An abstraction tells you to not care about implementation +details, but sometimes the implementation matters after all. When it +does, you have to waste time revising your assumptions. + +NLTK's wrappers call external tools via subprocesses, and wrap this up so +that it looks like a native API. This abstraction leaks *a lot*. The system +calls impose far more overhead than a normal Python function call, which makes +the most natural way to program against the API infeasible. + + +Case study: POS tagging +----------------------- + +Here's a quick comparison of the following POS taggers: + +* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process + from the command-line; +* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via + NLTK's wrapper; +* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document. +* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document. + + ++-------------------+-------------+--------+ +| System | Speed (w/s) | % Acc. | ++-------------------+-------------+--------+ +| spaCy | 107,000 | 96.7 | ++-------------------+-------------+--------+ +| Stanford (CLI) | 8,000 | 96.7 | ++-------------------+-------------+--------+ +| nltk.pos_tag | 543 | 94.0 | ++-------------------+-------------+--------+ +| nltk.tag.stanford | 209 | 96.7 | ++-------------------+-------------+--------+ + +Experimental details TODO. Three things are apparent from this comparison: + +1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate; + +2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower + than invoking the model once as a batch process, via the command-line; + +3. spaCy is over 10x faster than the Stanford tagger, even when called + **sentence-by-sentence**. + +The problem is that NLTK simply wraps the command-line +interfaces of these tools, so communication is via a subprocess. NLTK does not +even hold open a pipe for you --- the model is reloaded, again and again. + +To use the wrapper effectively, you should batch up your text as much as possible. +This probably isn't how you would like to structure your pipeline, and you +might not be able to batch up much text at all, e.g. if serving a single +request means processing a single document. +Technically, NLTK does give you Python functions to access lots of different +systems --- but, you can't use them as you would expect to use a normal Python +function. The abstraction leaks. + +Here's the bottom-line: the Stanford tools are written in Java, so using them +from Python sucks. You shouldn't settle for this. It's a problem that springs +purely from the tooling, rather than the domain. + +Summary +------- + +NLTK is a well-known Python library for NLP, but for the important bits, you +don't get actual Python modules. You get wrappers which throw to external +tools, via subprocesses. This is not at all the same thing. + +spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other +high-performance Python libraries. So you get a native Python API, but the +performance you expect from a program written in C. .. 
toctree:: :hidden: :maxdepth: 3 - - what/index.rst - why/index.rst - how/index.rst + + features.rst + license_stories.rst diff --git a/setup.py b/setup.py index c67bed4a1..827d44fc6 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ import os.path from os import path from glob import glob +import numpy + def clean(ext): for pyx in ext.sources: @@ -34,7 +36,7 @@ compile_args = [] link_args = [] libs = [] -includes = ['.'] +includes = ['.', numpy.get_include()] cython_includes = ['.'] @@ -50,18 +52,20 @@ exts = [ Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), - Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), - Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes), Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes), - Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), + Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes), + Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes), + Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++", + include_dirs=includes), + #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), - Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes), + #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes), ] diff --git a/spacy/context.pxd b/spacy/context.pxd deleted file mode 100644 index 8f798d347..000000000 --- a/spacy/context.pxd +++ /dev/null @@ -1,66 +0,0 @@ -from thinc.typedefs cimport atom_t -from .typedefs cimport hash_t -from .tokens cimport Tokens -from .lexeme cimport Lexeme - - -cdef class Token: - cdef readonly atom_t sic - cdef readonly atom_t cluster - cdef readonly atom_t norm - cdef readonly atom_t shape - cdef readonly atom_t asciied - cdef readonly atom_t prefix - cdef readonly atom_t 
suffix - cdef readonly atom_t length - - cdef readonly atom_t postype - cdef readonly atom_t nertype - cdef readonly atom_t sensetype - - cdef readonly atom_t is_alpha - cdef readonly atom_t is_ascii - cdef readonly atom_t is_digit - cdef readonly atom_t is_lower - cdef readonly atom_t is_punct - cdef readonly atom_t is_space - cdef readonly atom_t is_title - cdef readonly atom_t is_upper - cdef readonly atom_t like_url - cdef readonly atom_t like_number - cdef readonly atom_t oft_lower - cdef readonly atom_t oft_title - cdef readonly atom_t oft_upper - - cdef readonly atom_t in_males - cdef readonly atom_t in_females - cdef readonly atom_t in_surnames - cdef readonly atom_t in_places - cdef readonly atom_t in_games - cdef readonly atom_t in_celebs - cdef readonly atom_t in_names - - cdef readonly atom_t pos - cdef readonly atom_t sense - cdef readonly atom_t ner - - -cdef class Slots: - cdef readonly Token P4 - cdef readonly Token P3 - cdef readonly Token P2 - cdef readonly Token P1 - cdef readonly Token N0 - cdef readonly Token N1 - cdef readonly Token N2 - cdef readonly Token N3 - cdef readonly Token N4 - - -cdef int N_FIELDS - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1 - - -cpdef Slots FIELD_IDS diff --git a/spacy/context.pyx b/spacy/context.pyx deleted file mode 100644 index aeb78ae5c..000000000 --- a/spacy/context.pyx +++ /dev/null @@ -1,126 +0,0 @@ -from murmurhash.mrmr cimport hash64 -from .lexeme cimport * - - -cdef class Slots: - def __init__(self): - self.P4 = Token() - self.P3 = Token() - self.P2 = Token() - self.P1 = Token() - self.N0 = Token() - self.N1 = Token() - self.N2 = Token() - self.N3 = Token() - self.N4 = Token() - - -cdef void _number_token(Token t, int* n_fields): - cdef int i = n_fields[0] - t.sic = i; i += 1 - t.cluster = i; i += 1 - t.norm = i; i += 1 - t.shape = i; i += 1 - t.prefix = i; i += 1 - t.suffix = i; i += 1 - t.length = i; i += 1 - - t.postype = i; i += 1 - t.nertype = i; i += 1 - t.sensetype = i; i += 1 - - t.is_alpha = i; i += 1 - t.is_ascii = i; i += 1 - t.is_digit = i; i += 1 - t.is_lower = i; i += 1 - t.is_punct = i; i += 1 - t.is_space = i; i += 1 - t.is_title = i; i += 1 - t.is_upper = i; i += 1 - - t.like_number = i; i += 1 - t.like_url = i; i += 1 - - t.oft_lower = i; i += 1 - t.oft_title = i; i += 1 - t.oft_upper = i; i += 1 - - t.in_males = i; i += 1 - t.in_females = i; i += 1 - t.in_surnames = i; i += 1 - t.in_places = i; i += 1 - t.in_games = i; i += 1 - t.in_celebs = i; i += 1 - t.in_names = i; i += 1 - - t.pos = i; i += 1 - t.sense = i; i += 1 - t.ner = i; i += 1 - - n_fields[0] = i - - -cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner): - c[t.sic] = lex.sic - c[t.cluster] = lex.cluster - c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape - c[t.shape] = lex.shape - c[t.asciied] = lex.asciied - c[t.prefix] = lex.prefix - c[t.suffix] = lex.suffix - c[t.length] = lex.length - - c[t.postype] = lex.postype - c[t.nertype] = 0 - c[t.sensetype] = 0 - - c[t.is_alpha] = lex.flags & (1 << IS_ALPHA) - c[t.is_digit] = lex.flags & (1 << IS_DIGIT) - c[t.is_lower] = lex.flags & (1 << IS_LOWER) - c[t.is_punct] = lex.flags & (1 << IS_PUNCT) - c[t.is_space] = lex.flags & (1 << IS_SPACE) - c[t.is_title] = lex.flags & (1 << IS_TITLE) - c[t.is_upper] = lex.flags & (1 << IS_UPPER) - c[t.like_url] = lex.flags & (1 << LIKE_URL) - c[t.like_number] = lex.flags & (1 << LIKE_NUMBER) - c[t.oft_lower] = lex.flags & (1 << OFT_LOWER) - c[t.oft_title] = lex.flags & (1 << OFT_TITLE) - 
c[t.oft_upper] = lex.flags & (1 << OFT_UPPER) - - c[t.in_males] = lex.flags & (1 << IN_MALES) - c[t.in_females] = lex.flags & (1 << IN_FEMALES) - c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES) - c[t.in_places] = lex.flags & (1 << IN_PLACES) - c[t.in_games] = lex.flags & (1 << IN_GAMES) - c[t.in_celebs] = lex.flags & (1 << IN_CELEBS) - c[t.in_names] = lex.flags & (1 << IN_NAMES) - - c[t.pos] = pos - c[t.sense] = 0 - c[t.ner] = ner - - -cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1: - _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4]) - _fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3]) - _fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2]) - _fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1]) - _fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i]) - _fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1]) - _fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2]) - _fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3]) - _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4]) - return 1 - - -N_FIELDS = 0 -FIELD_IDS = Slots() -_number_token(FIELD_IDS.P4, &N_FIELDS) -_number_token(FIELD_IDS.P3, &N_FIELDS) -_number_token(FIELD_IDS.P2, &N_FIELDS) -_number_token(FIELD_IDS.P1, &N_FIELDS) -_number_token(FIELD_IDS.N0, &N_FIELDS) -_number_token(FIELD_IDS.N1, &N_FIELDS) -_number_token(FIELD_IDS.N2, &N_FIELDS) -_number_token(FIELD_IDS.N3, &N_FIELDS) -_number_token(FIELD_IDS.N4, &N_FIELDS) diff --git a/spacy/en.pxd b/spacy/en.pxd index a7c643eba..2ca081e47 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -1,5 +1,133 @@ -from spacy.lang cimport Language -from spacy.tokens cimport Tokens +from thinc.typedefs cimport atom_t + +from .lang cimport Language +from .tokens cimport Tokens +from .tokens cimport TokenC + + +cpdef enum en_person_t: + NO_PERSON + FIRST + SECOND + THIRD + NON_THIRD + + +cpdef enum en_number_t: + NO_NUMBER + SINGULAR + PLURAL + MASS + + +cpdef enum en_gender_t: + NO_GENDER + MASCULINE + FEMININE + NEUTER + + +cpdef enum en_case_t: + NO_CASE + NOMINATIVE + GENITIVE + ACCUSATIVE + REFLEXIVE + DEMONYM + + +cpdef enum en_tenspect_t: + NO_TENSE + BASE_VERB + PRESENT + PAST + PASSIVE + ING + MODAL + + +cpdef enum misc_t: + NO_MISC + COMPARATIVE + SUPERLATIVE + RELATIVE + NAME + + +# Flags +cpdef enum FlagID: + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + + LIKE_URL + LIKE_NUMBER + + OFT_LOWER + OFT_TITLE + OFT_UPPER + + IN_MALES + IN_FEMALES + IN_SURNAMES + IN_PLACES + IN_GAMES + IN_CELEBS + IN_NAMES + + +cpdef enum: + P2_sic + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_lemma + P2_pos_type + + P1_sic + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_lemma + P1_pos_type + + W_sic + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_lemma + W_pos_type + + N1_sic + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_lemma + N1_pos_type + + N2_sic + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_lemma + N2_pos_type + + N_CONTEXT_FIELDS cdef class English(Language): diff --git a/spacy/en.pyx b/spacy/en.pyx index 95c1cbd94..3ed0eaaa9 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -30,14 +30,101 @@ same scheme. Tokenization problems are a major cause of poor performance for NLP tools. 
If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module provides a fully Penn Treebank 3-compliant tokenizer. ''' -# TODO -#The script translate_treebank_tokenization can be used to transform a treebank's -#annotation to use one of the spacy tokenization schemes. - - from __future__ import unicode_literals cimport lang +from .typedefs cimport flags_t +import orth +from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB +from .morphology cimport X, PUNCT, EOL + +from .tokens cimport Morphology + + +POS_TAGS = { + 'NULL': (NO_TAG, {}), + 'EOL': (EOL, {}), + 'CC': (CONJ, {}), + 'CD': (NUM, {}), + 'DT': (DET, {}), + 'EX': (DET, {}), + 'FW': (X, {}), + 'IN': (ADP, {}), + 'JJ': (ADJ, {}), + 'JJR': (ADJ, {'misc': COMPARATIVE}), + 'JJS': (ADJ, {'misc': SUPERLATIVE}), + 'LS': (X, {}), + 'MD': (VERB, {'tenspect': MODAL}), + 'NN': (NOUN, {}), + 'NNS': (NOUN, {'number': PLURAL}), + 'NNP': (NOUN, {'misc': NAME}), + 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), + 'PDT': (DET, {}), + 'POS': (PRT, {'case': GENITIVE}), + 'PRP': (NOUN, {}), + 'PRP$': (NOUN, {'case': GENITIVE}), + 'RB': (ADV, {}), + 'RBR': (ADV, {'misc': COMPARATIVE}), + 'RBS': (ADV, {'misc': SUPERLATIVE}), + 'RP': (PRT, {}), + 'SYM': (X, {}), + 'TO': (PRT, {}), + 'UH': (X, {}), + 'VB': (VERB, {}), + 'VBD': (VERB, {'tenspect': PAST}), + 'VBG': (VERB, {'tenspect': ING}), + 'VBN': (VERB, {'tenspect': PASSIVE}), + 'VBP': (VERB, {'tenspect': PRESENT}), + 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), + 'WDT': (DET, {'misc': RELATIVE}), + 'WP': (PRON, {'misc': RELATIVE}), + 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), + 'WRB': (ADV, {'misc': RELATIVE}), + '!': (PUNCT, {}), + '#': (PUNCT, {}), + '$': (PUNCT, {}), + "''": (PUNCT, {}), + "(": (PUNCT, {}), + ")": (PUNCT, {}), + "-LRB-": (PUNCT, {}), + "-RRB-": (PUNCT, {}), + ".": (PUNCT, {}), + ",": (PUNCT, {}), + "``": (PUNCT, {}), + ":": (PUNCT, {}), + "?": (PUNCT, {}), +} + + +POS_TEMPLATES = ( + (W_sic,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_sic,), + (N2_sic,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_sic), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), + + (W_pos_type,), + (N1_pos_type,), + (N1_pos_type,), + (P1_pos, W_pos_type, N1_pos_type), +) cdef class English(Language): @@ -47,7 +134,68 @@ cdef class English(Language): name (unicode): The two letter code used by Wikipedia for the language. lexicon (Lexicon): The lexicon. Exposes the lookup method. 
""" - pass + def get_props(self, unicode string): + return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)} + + def set_flags(self, unicode string): + cdef flags_t flags = 0 + flags |= orth.is_alpha(string) << IS_ALPHA + flags |= orth.is_ascii(string) << IS_ASCII + flags |= orth.is_digit(string) << IS_DIGIT + flags |= orth.is_lower(string) << IS_LOWER + flags |= orth.is_punct(string) << IS_PUNCT + flags |= orth.is_space(string) << IS_SPACE + flags |= orth.is_title(string) << IS_TITLE + flags |= orth.is_upper(string) << IS_UPPER + + flags |= orth.like_url(string) << LIKE_URL + flags |= orth.like_number(string) << LIKE_NUMBER + return flags + + def set_pos(self, Tokens tokens): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + cdef TokenC* t = tokens.data + assert self.morphologizer is not None + cdef dict tagdict = self.pos_tagger.tagdict + for i in range(tokens.length): + if t[i].lex.sic in tagdict: + t[i].pos = tagdict[t[i].lex.sic] + else: + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) + self.morphologizer.set_morph(i, t) + + def train_pos(self, Tokens tokens, golds): + cdef int i + cdef atom_t[N_CONTEXT_FIELDS] context + c = 0 + cdef TokenC* t = tokens.data + for i in range(tokens.length): + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context, [golds[i]]) + self.morphologizer.set_morph(i, t) + c += t[i].pos == golds[i] + return c + + +cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.lemma + context[7] = t.lex.pos_type EN = English('en') diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 68f1ee58a..20986f134 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -1,38 +1,38 @@ from libcpp.vector cimport vector +from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER + from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .tokens cimport Tokens +from .tokens cimport Tokens, TokenC from .lexeme cimport Lexeme from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser -from .utf8string cimport StringStore +from .utf8string cimport StringStore, UniStr +from .morphology cimport Morphologizer -cdef extern from "Python.h": - cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch) - cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch) - cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch) +cdef union LexemesOrTokens: + const Lexeme* const* lexemes + TokenC* tokens -cdef struct String: - Py_UNICODE* chars - size_t n - hash_t key +cdef struct Cached: + LexemesOrTokens data + bint is_lex + int length cdef class Lexicon: + cpdef public get_lex_props cdef Pool mem - cpdef readonly size_t size cpdef readonly StringStore strings cdef vector[Lexeme*] lexemes - cdef Lexeme* get(self, String* s) except NULL + cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL - cdef PreshMap _dict + cdef PreshMap _map cdef class Language: @@ -41,9 +41,8 @@ cdef class Language: cdef PreshMap _cache 
cdef PreshMap _specials cpdef readonly Lexicon lexicon - cpdef readonly Tagger pos_tagger - cpdef readonly NERParser ner_tagger + cpdef readonly Morphologizer morphologizer cdef object _prefix_re cdef object _suffix_re @@ -52,13 +51,14 @@ cdef class Language: cpdef Tokens tokens_from_list(self, list strings) cpdef Tokens tokenize(self, unicode text) - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1 + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1 + cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except NULL - cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 79a84e936..4617c3853 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -18,13 +18,14 @@ from preshed.maps cimport PreshMap from .lexeme cimport Lexeme from .lexeme cimport EMPTY_LEXEME from .lexeme cimport init as lexeme_init +from .lexeme cimport check_flag + +from .utf8string cimport slice_unicode from . import util from .util import read_lang_data from .tokens import Tokens - -from .tagger cimport Tagger -from .ner.greedy_parser cimport NERParser +from .tokens cimport Morphology cdef class Language: @@ -37,29 +38,30 @@ cdef class Language: self._prefix_re = re.compile(prefix) self._suffix_re = re.compile(suffix) self._infix_re = re.compile(infix) - self.lexicon = Lexicon() - if path.exists(path.join(util.DATA_DIR, name, 'lexemes')): - self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) - self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) + self.lexicon = Lexicon(self.get_props) self._load_special_tokenization(rules) - if path.exists(path.join(util.DATA_DIR, name, 'pos')): - self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos')) - else: - self.pos_tagger = None - if path.exists(path.join(util.DATA_DIR, name, 'ner')): - self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner')) + self.pos_tagger = None + self.morphologizer = None + + def load(self): + self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes')) + self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings')) + if path.exists(path.join(util.DATA_DIR, self.name, 'pos')): + self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos')) + self.morphologizer = Morphologizer(self.lexicon.strings, + path.join(util.DATA_DIR, self.name)) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens - cdef String string_struct + cdef UniStr string_struct cdef unicode py_string cdef int idx = 0 for i, py_string in enumerate(strings): - string_from_unicode(&string_struct, py_string) - 
tokens.push_back(idx, self.lexicon.get(&string_struct)) + slice_unicode(&string_struct, py_string, 0, len(py_string)) + tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct)) idx += len(py_string) + 1 return tokens @@ -79,22 +81,21 @@ cdef class Language: tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. """ cdef int length = len(string) - cdef Tokens tokens = Tokens(self.lexicon.strings, length) + cdef Tokens tokens = Tokens(self, length) if length == 0: return tokens cdef int i = 0 cdef int start = 0 + cdef bint cache_hit cdef Py_UNICODE* chars = string cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) - cdef String span + cdef UniStr span for i in range(1, length): if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: - string_slice(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + slice_unicode(&span, chars, start, i) + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) in_ws = not in_ws start = i @@ -102,15 +103,27 @@ cdef class Language: start += 1 i += 1 if start < i: - string_slice(&span, chars, start, i) - lexemes = self._cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: + slice_unicode(&span, chars, start, i) + cache_hit = self._try_cache(start, span.key, tokens) + if not cache_hit: self._tokenize(tokens, &span, start, i) return tokens - cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: + cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1: + #cached = self._specials.get(key) + cached = self._cache.get(key) + if cached == NULL: + return False + cdef int i + if cached.is_lex: + for i in range(cached.length): + idx = tokens.push_back(idx, cached.data.lexemes[i]) + else: + for i in range(cached.length): + idx = tokens.push_back(idx, &cached.data.tokens[i]) + return True + + cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1: cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] suffixes cdef hash_t orig_key @@ -119,88 +132,95 @@ cdef class Language: orig_size = tokens.length self._split_affixes(span, &prefixes, &suffixes) self._attach_tokens(tokens, start, span, &prefixes, &suffixes) - self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) + self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size) - cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except NULL: + cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except NULL: cdef size_t i - cdef String prefix - cdef String suffix - cdef String minus_pre - cdef String minus_suf + cdef UniStr prefix + cdef UniStr suffix + cdef UniStr minus_pre + cdef UniStr minus_suf cdef size_t last_size = 0 while string.n != 0 and string.n != last_size: last_size = string.n pre_len = self._find_prefix(string.chars, string.n) if pre_len != 0: - string_slice(&prefix, string.chars, 0, pre_len) - string_slice(&minus_pre, string.chars, pre_len, string.n) + slice_unicode(&prefix, string.chars, 0, pre_len) + slice_unicode(&minus_pre, string.chars, pre_len, string.n) # Check whether we've hit a special-case if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, 
&prefix)) break suf_len = self._find_suffix(string.chars, string.n) if suf_len != 0: - string_slice(&suffix, string.chars, string.n - suf_len, string.n) - string_slice(&minus_suf, string.chars, 0, string.n - suf_len) + slice_unicode(&suffix, string.chars, string.n - suf_len, string.n) + slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len) # Check whether we've hit a special-case if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= string.n: - string_slice(string, string.chars, pre_len, string.n - suf_len) - prefixes.push_back(self.lexicon.get(&prefix)) - suffixes.push_back(self.lexicon.get(&suffix)) + slice_unicode(string, string.chars, pre_len, string.n - suf_len) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) elif pre_len: string[0] = minus_pre - prefixes.push_back(self.lexicon.get(&prefix)) + prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix)) elif suf_len: string[0] = minus_suf - suffixes.push_back(self.lexicon.get(&suffix)) + suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix)) if self._specials.get(string.key): break return string - cdef int _attach_tokens(self, Tokens tokens, - int idx, String* string, - vector[Lexeme*] *prefixes, - vector[Lexeme*] *suffixes) except -1: + cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string, + vector[const Lexeme*] *prefixes, + vector[const Lexeme*] *suffixes) except -1: + cdef bint cache_hit cdef int split - cdef Lexeme** lexemes + cdef const Lexeme* const* lexemes cdef Lexeme* lexeme - cdef String span + cdef UniStr span + cdef int i if prefixes.size(): - idx = tokens.extend(idx, prefixes.data(), prefixes.size()) + for i in range(prefixes.size()): + idx = tokens.push_back(idx, prefixes[0][i]) if string.n != 0: - - lexemes = self._cache.get(string.key) - if lexemes != NULL: - idx = tokens.extend(idx, lexemes, 0) + cache_hit = self._try_cache(idx, string.key, tokens) + if cache_hit: + idx = tokens.data[tokens.length - 1].idx + 1 else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: - idx = tokens.push_back(idx, self.lexicon.get(string)) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string)) else: - string_slice(&span, string.chars, 0, split) - idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split, split+1) - idx = tokens.push_back(idx, self.lexicon.get(&span)) - string_slice(&span, string.chars, split + 1, string.n) - idx = tokens.push_back(idx, self.lexicon.get(&span)) - cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin() + slice_unicode(&span, string.chars, 0, split) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) + slice_unicode(&span, string.chars, split, split+1) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) + slice_unicode(&span, string.chars, split + 1, string.n) + idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span)) + cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1: - lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**)) + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except 
-1: cdef int i for i in range(n): - lexemes[i] = tokens[i] - lexemes[i + 1] = NULL - self._cache.set(key, lexemes) + if tokens[i].lex.id == 1: + return 0 + cached = self.mem.alloc(1, sizeof(Cached)) + cached.length = n + cached.is_lex = True + lexemes = self.mem.alloc(n, sizeof(Lexeme**)) + for i in range(n): + lexemes[i] = tokens[i].lex + cached.data.lexemes = lexemes + self._cache.set(key, cached) cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1: cdef unicode string = chars[:length] @@ -217,66 +237,120 @@ cdef class Language: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, token_rules): - '''Load special-case tokenization rules. - - Loads special-case tokenization rules into the Language._cache cache, - read from data//tokenization . The special cases are loaded before - any language data is tokenized, giving these priority. For instance, - the English tokenization rules map "ain't" to ["are", "not"]. - - Args: - token_rules (list): A list of (chunk, tokens) pairs, where chunk is - a string and tokens is a list of strings. + def _load_special_tokenization(self, object rules): + '''Add a special-case tokenization rule. ''' + cdef int i + cdef unicode chunk + cdef list substrings + cdef unicode form + cdef unicode lemma + cdef dict props cdef Lexeme** lexemes cdef hash_t hashed - cdef String string - for uni_string, substrings in token_rules: - lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*)) - for i, substring in enumerate(substrings): - string_from_unicode(&string, substring) - lexemes[i] = self.lexicon.get(&string) - lexemes[i + 1] = NULL - string_from_unicode(&string, uni_string) - self._specials.set(string.key, lexemes) - self._cache.set(string.key, lexemes) + cdef UniStr string + for chunk, substrings in sorted(rules.items()): + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + form = props['F'] + lemma = props.get("L", None) + slice_unicode(&string, form, 0, len(form)) + tokens[i].lex = self.lexicon.get(self.lexicon.mem, &string) + if lemma: + tokens[i].lemma = self.lexicon.strings[lemma] + set_morph_from_dict(&tokens[i].morph, props) + cached = self.mem.alloc(1, sizeof(Cached)) + cached.length = len(substrings) + cached.is_lex = False + cached.data.tokens = tokens + slice_unicode(&string, chunk, 0, len(chunk)) + self._specials.set(string.key, cached) + self._cache.set(string.key, cached) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) cdef class Lexicon: - def __init__(self): + '''A map container for a language's Lexeme structs. + + Also interns UTF-8 strings, and maps them to consecutive integer IDs. 
+ ''' + def __init__(self, object get_props): self.mem = Pool() - self._dict = PreshMap(2 ** 20) + self._map = PreshMap(2 ** 20) self.strings = StringStore() self.lexemes.push_back(&EMPTY_LEXEME) - self.size = 1 + self.get_lex_props = get_props - cdef Lexeme* get(self, String* string) except NULL: + def __len__(self): + return self.lexemes.size() + + cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: + '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme + if necessary, using memory acquired from the given pool. If the pool + is the lexicon's own memory, the lexeme is saved in the lexicon.''' cdef Lexeme* lex - lex = self._dict.get(string.key) + lex = self._map.get(string.key) if lex != NULL: return lex - lex = self.mem.alloc(sizeof(Lexeme), 1) - lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {}) - self._dict.set(string.key, lex) - while self.lexemes.size() < (lex.id + 1): - self.lexemes.push_back(&EMPTY_LEXEME) - self.lexemes[lex.id] = lex - self.size += 1 + if string.n < 3: + mem = self.mem + cdef unicode py_string = string.chars[:string.n] + lex = mem.alloc(sizeof(Lexeme), 1) + lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings, + self.get_lex_props(py_string)) + if mem is self.mem: + self._map.set(string.key, lex) + while self.lexemes.size() < (lex.id + 1): + self.lexemes.push_back(&EMPTY_LEXEME) + self.lexemes[lex.id] = lex + else: + lex[0].id = 1 return lex def __getitem__(self, id_or_string): + '''Retrieve a lexeme, given an int ID or a unicode string. If a previously + unseen unicode string is given, a new Lexeme is created and stored. + + This function relies on Cython's struct-to-dict conversion. Python clients + receive a dict keyed by strings (byte or unicode, depending on Python 2/3), + with int values. Cython clients can instead receive a Lexeme struct value. + More efficient Cython access is provided by Lexicon.get, which returns + a Lexeme*. + + Args: + id_or_string (int or unicode): The integer ID of a word, or its unicode + string. If an int >= Lexicon.size, IndexError is raised. + If id_or_string is neither an int nor a unicode string, ValueError + is raised. + + Returns: + lexeme (dict): A Lexeme struct instance, which Cython translates into + a dict if the operator is called from Python. + ''' if type(id_or_string) == int: + if id_or_string >= self.lexemes.size(): + raise IndexError return self.lexemes.at(id_or_string)[0] - cdef String string - string_from_unicode(&string, id_or_string) - cdef Lexeme* lexeme = self.get(&string) + cdef UniStr string + slice_unicode(&string, id_or_string, 0, len(id_or_string)) + cdef const Lexeme* lexeme = self.get(self.mem, &string) return lexeme[0] def __setitem__(self, unicode uni_string, dict props): - cdef String s - string_from_unicode(&s, uni_string) - cdef Lexeme* lex = self.get(&s) + cdef UniStr s + slice_unicode(&s, uni_string, 0, len(uni_string)) + # Cast through the const here, since we're allowed to change our own + # Lexemes. 
+ lex = self.get(self.mem, &s) lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) def dump(self, loc): @@ -287,11 +361,11 @@ cdef class Lexicon: assert fp != NULL cdef size_t st cdef hash_t key - for i in range(self._dict.length): - key = self._dict.c_map.cells[i].key + for i in range(self._map.length): + key = self._map.c_map.cells[i].key if key == 0: continue - lexeme = self._dict.c_map.cells[i].value + lexeme = self._map.c_map.cells[i].value st = fwrite(&key, sizeof(key), 1, fp) assert st == 1 st = fwrite(lexeme, sizeof(Lexeme), 1, fp) @@ -300,7 +374,8 @@ cdef class Lexicon: assert st == 0 def load(self, loc): - assert path.exists(loc) + if not path.exists(loc): + raise IOError('Lexemes file not found at %s' % loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef FILE* fp = fopen(bytes_loc, 'rb') assert fp != NULL @@ -316,21 +391,9 @@ cdef class Lexicon: st = fread(lexeme, sizeof(Lexeme), 1, fp) if st != 1: break - self._dict.set(key, lexeme) + self._map.set(key, lexeme) while self.lexemes.size() < (lexeme.id + 1): self.lexemes.push_back(&EMPTY_LEXEME) self.lexemes[lexeme.id] = lexeme i += 1 - self.size += 1 fclose(fp) - - -cdef void string_from_unicode(String* s, unicode uni): - cdef Py_UNICODE* c_uni = uni - string_slice(s, c_uni, 0, len(uni)) - - -cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil: - s.chars = &chars[start] - s.n = end - start - s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..ce9bbefdc --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,90 @@ +from os import path + + +NOUN_RULES = ( + ('s', ''), + ('ses', 's'), + ('ves', 'f'), + ('xes', 'x'), + ('zes', 'z'), + ('ches', 'ch'), + ('shes', 'sh'), + ('men', 'man'), + ('ies', 'y') +) + + +VERB_RULES = ( + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", "") +) + + +ADJ_RULES = ( + ("er", ""), + ("est", ""), + ("er", "e"), + ("est", "e") +) + + +class Lemmatizer(object): + def __init__(self, wn_dict_dir): + self.index = {} + self.exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) + self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + + def noun(self, string): + return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + + def verb(self, string): + return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + + def adj(self, string): + return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + + +def lemmatize(string, index, exceptions, rules): + string = string.lower() + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + if not forms: + forms.append(string) + return set(forms) + + +def read_index(loc): + index = set() + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + word = pieces[0] + if word.count('_') == 0: + index.add(word) + return index + + +def read_exc(loc): + exceptions = {} + for line in open(loc): + if line.startswith(' '): + continue + pieces = line.split() + exceptions[pieces[0]] = tuple(pieces[1:]) + return exceptions diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 0d7d206e5..a6f20906b 100644 --- 
a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,61 +1,137 @@ -from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t +from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t from .utf8string cimport StringStore -from libc.stdint cimport uint16_t -cpdef flag_t OOV_DIST_FLAGS -# Flags -cpdef enum: - IS_ALPHA - IS_ASCII - IS_DIGIT - IS_LOWER - IS_PUNCT - IS_SPACE - IS_TITLE - IS_UPPER +# Reserve 64 values for flag features +cpdef enum attr_id_t: + FLAG0 + FLAG1 + FLAG2 + FLAG3 + FLAG4 + FLAG5 + FLAG6 + FLAG7 + FLAG8 + FLAG9 + FLAG10 + FLAG11 + FLAG12 + FLAG13 + FLAG14 + FLAG15 + FLAG16 + FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 - LIKE_URL - LIKE_NUMBER + ID + SIC + DENSE + SHAPE + PREFIX + SUFFIX - OFT_LOWER - OFT_TITLE - OFT_UPPER - - IN_MALES - IN_FEMALES - IN_SURNAMES - IN_PLACES - IN_GAMES - IN_CELEBS - IN_NAMES + LENGTH + CLUSTER + POS_TYPE + LEMMA cdef struct Lexeme: - flag_t flags + flags_t flags - id_t id - id_t sic - id_t norm - id_t shape - id_t asciied - id_t prefix - id_t suffix + attr_t id + attr_t sic + attr_t dense + attr_t shape + attr_t prefix + attr_t suffix + + attr_t length + attr_t cluster + attr_t pos_type float prob - - len_t length - tag_t cluster - tag_t postype - tag_t supersense + float sentiment cdef Lexeme EMPTY_LEXEME -cpdef Lexeme init(id_t i, unicode string, hash_t hashed, - StringStore store, dict props) except * + +cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store, + dict props) except * -cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil: +cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + +cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == SIC: + return lex.sic + elif feat_name == DENSE: + return lex.dense + elif feat_name == SHAPE: + return lex.shape + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + elif feat_name == POS_TYPE: + return lex.pos_type + else: + return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 64eb699a6..f1974cbc9 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -6,67 +6,25 @@ from libc.string cimport memset import orth -from .utf8string cimport Utf8Str - -OOV_DIST_FLAGS = 0 memset(&EMPTY_LEXEME, 0, sizeof(Lexeme)) -def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): - cdef flag_t flags = 0 - flags |= orth.is_alpha(string) << IS_ALPHA - flags |= orth.is_ascii(string) << IS_ASCII - flags |= orth.is_digit(string) << IS_DIGIT - flags |= orth.is_lower(string) << IS_LOWER - flags |= orth.is_punct(string) << IS_PUNCT - flags |= orth.is_space(string) << IS_SPACE - flags |= orth.is_title(string) << IS_TITLE - flags |= orth.is_upper(string) << IS_UPPER - - flags |= orth.like_url(string) << LIKE_URL - flags |= orth.like_number(string) << LIKE_NUMBER - return flags - - cpdef Lexeme init(id_t i, unicode 
string, hash_t hashed, - StringStore store, dict props) except *: + StringStore string_store, dict props) except *: cdef Lexeme lex lex.id = i lex.length = len(string) - lex.sic = get_string_id(string, store) + lex.sic = string_store[string] lex.cluster = props.get('cluster', 0) - lex.postype = props.get('postype', 0) - lex.supersense = props.get('supersense', 0) + lex.pos_type = props.get('pos_type', 0) lex.prob = props.get('prob', 0) - cdef float upper_pc = props.get('upper_pc', 0.0) - cdef float lower_pc = props.get('lower_pc', 0.0) - cdef float title_pc = props.get('title_pc', 0.0) - - lex.prefix = get_string_id(string[0], store) - lex.suffix = get_string_id(string[-3:], store) - if upper_pc or lower_pc or title_pc: - canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc) - lex.norm = get_string_id(canon_cased, store) - else: - lex.norm = lex.sic - lex.shape = get_string_id(orth.word_shape(string), store) - lex.asciied = get_string_id(orth.asciied(string), store) - lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) - - lex.flags |= props.get('in_males', 0) << IN_MALES - lex.flags |= props.get('in_females', 0) << IN_FEMALES - lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES - lex.flags |= props.get('in_places', 0) << IN_PLACES - lex.flags |= props.get('in_celebs', 0) << IN_CELEBS - lex.flags |= props.get('in_games', 0) << IN_GAMES - lex.flags |= props.get('in_names', 0) << IN_NAMES + lex.prefix = string_store[string[:1]] + lex.suffix = string_store[string[-3:]] + lex.shape = string_store[orth.word_shape(string)] + lex.dense = string_store[props['dense']] + + lex.flags = props.get('flags', 0) return lex - - -cdef id_t get_string_id(unicode string, StringStore store) except 0: - cdef bytes byte_string = string.encode('utf8') - cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string)) - return orig_str.i diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd new file mode 100644 index 000000000..9c5d342e9 --- /dev/null +++ b/spacy/morphology.pxd @@ -0,0 +1,45 @@ + +from .tokens cimport TokenC +from .lexeme cimport Lexeme +from .utf8string cimport StringStore +from .typedefs cimport id_t, Morphology + +from preshed.maps cimport PreshMapArray +from cymem.cymem cimport Pool + + +# Google universal tag set +cpdef enum univ_tag_t: + NO_TAG + ADJ + ADV + ADP + CONJ + DET + NOUN + NUM + PRON + PRT + VERB + X + PUNCT + EOL + N_UNIV_TAGS + + +cdef struct PosTag: + Morphology morph + int id + univ_tag_t pos + + +cdef class Morphologizer: + cdef Pool mem + cdef StringStore strings + cdef object lemmatizer + cdef PosTag* tags + cdef readonly list tag_names + + cdef PreshMapArray _cache + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 + cdef int set_morph(self, const int i, TokenC* tokens) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx new file mode 100644 index 000000000..346c778a9 --- /dev/null +++ b/spacy/morphology.pyx @@ -0,0 +1,117 @@ +# cython: profile=True +# cython: embedsignature=True +from os import path +import json + +from .lemmatizer import Lemmatizer +from .typedefs cimport id_t + +UNIV_TAGS = { + 'NULL': NO_TAG, + 'ADJ': ADJ, + 'ADV': ADV, + 'ADP': ADP, + 'CONJ': CONJ, + 'DET': DET, + 'NOUN': NOUN, + 'NUM': NUM, + 'PRON': PRON, + 'PRT': PRT, + 'VERB': VERB, + 'X': X, + '.': PUNCT, + 'EOL': EOL +} + + +cdef struct _Cached: + Morphology morph + int lemma + + +cdef class Morphologizer: + """Given a POS tag and a Lexeme, find its lemma and morphological analysis. 
+ """ + def __init__(self, StringStore strings, data_dir): + self.mem = Pool() + self.strings = strings + cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) + tag_map = cfg['tag_map'] + self.tag_names = cfg['tag_names'] + self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) + self._cache = PreshMapArray(len(self.tag_names)) + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): + pos, props = tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + self.tags[i].morph.number = props.get('number', 0) + self.tags[i].morph.tenspect = props.get('tenspect', 0) + self.tags[i].morph.mood = props.get('mood', 0) + self.tags[i].morph.gender = props.get('gender', 0) + self.tags[i].morph.person = props.get('person', 0) + self.tags[i].morph.case = props.get('case', 0) + self.tags[i].morph.misc = props.get('misc', 0) + if path.exists(path.join(data_dir, 'morphs.json')): + with open(path.join(data_dir, 'morphs.json')) as file_: + self.load_exceptions(json.load(file_)) + + cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: + if self.lemmatizer is None: + return lex.sic + if pos != NOUN and pos != VERB and pos != ADJ: + return lex.sic + cdef bytes py_string = self.strings[lex.sic] + cdef set lemma_strings + cdef bytes lemma_string + if pos == NOUN: + lemma_strings = self.lemmatizer.noun(py_string) + elif pos == VERB: + lemma_strings = self.lemmatizer.verb(py_string) + else: + assert pos == ADJ + lemma_strings = self.lemmatizer.adj(py_string) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings.intern(lemma_string, len(lemma_string)).i + return lemma + + cdef int set_morph(self, const int i, TokenC* tokens) except -1: + cdef const PosTag* tag = &self.tags[tokens[i].pos] + cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic) + if cached is NULL: + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) + cached.morph = tag.morph + self._cache.set(tag.id, tokens[i].lex.sic, cached) + + tokens[i].lemma = cached.lemma + tokens[i].morph = cached.morph + + def load_exceptions(self, dict exc): + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef id_t sic + cdef univ_tag_t pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + sic = self.strings[form_str] + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.strings[lemma_str] + set_morph_from_dict(&cached.morph, props) + self._cache.set(pos, sic, cached) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) diff --git a/spacy/orth.py b/spacy/orth.py index 0462d15df..2400b38a6 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import unicodedata from unidecode import unidecode +import re import math diff --git a/spacy/pos_util.py b/spacy/pos_util.py index e5716665e..489f03dde 100644 --- a/spacy/pos_util.py +++ b/spacy/pos_util.py @@ -147,6 +147,7 @@ Y PRT Z NOUN ^ NOUN ~ X -`` .""".strip().split('\n')) +`` . 
+EOL EOL""".strip().split('\n')) return mapping[tag] diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 11d8d2a4c..33732f987 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -1,34 +1,23 @@ +from libc.stdint cimport uint8_t + from cymem.cymem cimport Pool from thinc.learner cimport LinearModel from thinc.features cimport Extractor from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t -from .typedefs cimport hash_t -from .context cimport Slots +from preshed.maps cimport PreshMapArray + +from .typedefs cimport hash_t, id_t from .tokens cimport Tokens -cpdef enum TagType: - POS - ENTITY - SENSE - - cdef class Tagger: - cpdef int set_tags(self, Tokens tokens) except -1 - cpdef class_t predict(self, int i, Tokens tokens) except 0 - cpdef int tell_answer(self, list gold) except -1 + cdef class_t predict(self, const atom_t* context, object golds=*) except * cpdef readonly Pool mem cpdef readonly Extractor extractor cpdef readonly LinearModel model - cpdef readonly TagType tag_type cpdef readonly list tag_names - - cdef class_t _guess - cdef atom_t* _context - cdef feat_t* _feats - cdef weight_t* _values - cdef weight_t* _scores + cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 428814f70..9890e95e1 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,5 +1,4 @@ # cython: profile=True -from __future__ import print_function from __future__ import unicode_literals from __future__ import division @@ -10,155 +9,59 @@ import random import json import cython - -from .context cimport fill_context -from .context cimport N_FIELDS - -from thinc.features cimport ConjFeat +from thinc.features cimport Feature, count_feats -NULL_TAG = 0 - - -def setup_model_dir(tag_type, tag_names, templates, model_dir): +def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { - 'tag_type': tag_type, 'templates': templates, 'tag_names': tag_names, + 'tag_map': tag_map, + 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: json.dump(config, file_) -def train(train_sents, model_dir, nr_iter=10): - cdef Tokens tokens - tagger = Tagger(model_dir) - for _ in range(nr_iter): - n_corr = 0 - total = 0 - for tokens, golds in train_sents: - assert len(tokens) == len(golds), [t.string for t in tokens] - for i in range(tokens.length): - if tagger.tag_type == POS: - gold = _get_gold_pos(i, golds, tokens.pos) - elif tagger.tag_type == ENTITY: - gold = _get_gold_ner(i, golds, tokens.ner) - guess = tagger.predict(i, tokens) - tokens.set_tag(i, tagger.tag_type, guess) - if gold is not None: - tagger.tell_answer(gold) - total += 1 - n_corr += guess in gold - #print('%s\t%d\t%d' % (tokens[i].string, guess, gold)) - print('%.4f' % ((n_corr / total) * 100)) - random.shuffle(train_sents) - tagger.model.end_training() - tagger.model.dump(path.join(model_dir, 'model')) - - -cdef object _get_gold_pos(i, golds, int* pred): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -cdef object _get_gold_ner(i, golds, int* ner): - if golds[i] == 0: - return None - else: - return [golds[i]] - - -def evaluate(tagger, sents): - n_corr = 0 - total = 0 - for tokens, golds in sents: - for i, gold in enumerate(golds): - guess = tagger.predict(i, tokens) - tokens.set_tag(i, tagger.tag_type, guess) - if gold != NULL_TAG: - total += 1 - n_corr += guess == gold - return n_corr / total - - cdef class Tagger: - """Assign part-of-speech, named entity or supersense tags, using 
greedy - decoding. The tagger reads its model and configuration from disk. + """Predict some type of tag, using greedy decoding. The tagger reads its + model and configuration from disk. """ def __init__(self, model_dir): self.mem = Pool() cfg = json.load(open(path.join(model_dir, 'config.json'))) templates = cfg['templates'] + univ_counts = {} + cdef unicode tag + cdef unicode univ_tag self.tag_names = cfg['tag_names'] - self.tag_type = cfg['tag_type'] - self.extractor = Extractor(templates, [ConjFeat] * len(templates)) - self.model = LinearModel(len(self.tag_names)) + self.tagdict = _make_tag_dict(cfg['tag_counts']) + self.extractor = Extractor(templates) + self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) - self._context = self.mem.alloc(N_FIELDS, sizeof(atom_t)) - self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) - self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) - self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t)) - self._guess = NULL_TAG - - cpdef int set_tags(self, Tokens tokens) except -1: - """Assign tags to a Tokens object. - - >>> tokens = EN.tokenize(u'An example sentence.') - >>> assert tokens[0].pos == 'NO_TAG' - >>> EN.pos_tagger.set_tags(tokens) - >>> assert tokens[0].pos == 'DT' - """ - cdef int i - for i in range(tokens.length): - tokens.set_tag(i, self.tag_type, self.predict(i, tokens)) - - cpdef class_t predict(self, int i, Tokens tokens) except 0: - """Predict the tag of tokens[i]. The tagger remembers the features and - prediction, in case you later call tell_answer. + cdef class_t predict(self, atom_t* context, object golds=None) except *: + """Predict the tag of tokens[i]. >>> tokens = EN.tokenize(u'An example sentence.') >>> tag = EN.pos_tagger.predict(0, tokens) >>> assert tag == EN.pos_tagger.tag_id('DT') == 5 """ - fill_context(self._context, i, tokens) - self.extractor.extract(self._feats, self._values, self._context, NULL) - self._guess = self.model.score(self._scores, self._feats, self._values) - return self._guess - - cpdef int tell_answer(self, list golds) except -1: - """Provide the correct tag for the word the tagger was last asked to predict. - During Tagger.predict, the tagger remembers the features and prediction - for the example. These are used to calculate a weight update given the - correct label. 
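The update step that the removed tell_answer method used to perform is now folded into predict itself: when golds is passed, the tagger scores the extracted features, takes the argmax as its guess, and if the guess is not a gold tag it counts the features +1 for the best-scoring gold class and -1 for the guess before calling model.update, as the replacement predict body further below shows. A minimal pure-Python sketch of that rule, with a hypothetical dict-of-dicts weight table and a plain feature list standing in for thinc's Extractor and LinearModel::

    from collections import defaultdict

    # feature -> class -> weight; an illustrative stand-in, not the thinc API
    weights = defaultdict(lambda: defaultdict(float))

    def predict(features, golds=None):
        scores = defaultdict(float)
        for feat in features:
            for clas, weight in weights[feat].items():
                scores[clas] += weight
        guess = max(scores, key=scores.get) if scores else 0
        if golds is not None and guess not in golds:
            # reward the best-scoring gold class, penalise the wrong guess
            best = max(golds, key=lambda clas: scores[clas])
            for feat in features:
                weights[feat][best] += 1.0
                weights[feat][guess] -= 1.0
        return guess
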
- - >>> tokens = EN.tokenize('An example sentence.') - >>> guess = EN.pos_tagger.predict(1, tokens) - >>> JJ = EN.pos_tagger.tag_id('JJ') - >>> JJ - 7 - >>> EN.pos_tagger.tell_answer(JJ) - """ - cdef class_t guess = self._guess - if guess in golds: - self.model.update({}) - return 0 - best_gold = golds[0] - best_score = self._scores[best_gold-1] - for gold in golds[1:]: - if self._scores[gold-1] > best_gold: - best_score = self._scores[best_gold-1] - best_gold = gold - counts = {guess: {}, best_gold: {}} - self.extractor.count(counts[best_gold], self._feats, 1) - self.extractor.count(counts[guess], self._feats, -1) - self.model.update(counts) + cdef int n_feats + cdef Feature* feats = self.extractor.get_feats(context, &n_feats) + cdef weight_t* scores = self.model.get_scores(feats, n_feats) + guess = _arg_max(scores, self.model.nr_class) + if golds is not None and guess not in golds: + best = _arg_max_among(scores, golds) + counts = {guess: {}, best: {}} + count_feats(counts[guess], feats, n_feats, -1) + count_feats(counts[best], feats, n_feats, 1) + self.model.update(counts) + return guess def tag_id(self, object tag_name): """Encode tag_name into a tag ID integer.""" @@ -167,3 +70,41 @@ cdef class Tagger: tag_id = len(self.tag_names) self.tag_names.append(tag_name) return tag_id + + +def _make_tag_dict(counts): + freq_thresh = 20 + ambiguity_thresh = 0.97 + tagdict = {} + cdef atom_t word + cdef atom_t tag + for word_str, tag_freqs in counts.items(): + tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1]) + n = sum(tag_freqs.values()) + word = int(word_str) + tag = int(tag_str) + if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: + tagdict[word] = tag + return tagdict + + +cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000: + cdef int best = 0 + cdef weight_t score = scores[best] + cdef int i + for i in range(1, n_classes): + if scores[i] >= score: + score = scores[i] + best = i + return best + + +cdef class_t _arg_max_among(weight_t* scores, list classes): + cdef int best = classes[0] + cdef weight_t score = scores[best] + cdef class_t clas + for clas in classes: + if scores[clas] > score: + score = scores[clas] + best = clas + return best diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index d1b2ef10b..43aa7b442 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,40 +1,55 @@ +import numpy as np +cimport numpy as np + from cymem.cymem cimport Pool +from thinc.typedefs cimport atom_t from .lexeme cimport Lexeme -from .typedefs cimport flag_t -from .utf8string cimport StringStore -from .tagger cimport TagType -from thinc.typedefs cimport atom_t +from .typedefs cimport flags_t +from .typedefs cimport Morphology +from .lang cimport Language + + + +cdef struct TokenC: + const Lexeme* lex + Morphology morph + int idx + int pos + int lemma + int sense + + +ctypedef const Lexeme* const_Lexeme_ptr +ctypedef TokenC* TokenC_ptr + +ctypedef fused LexemeOrToken: + const_Lexeme_ptr + TokenC_ptr cdef class Tokens: cdef Pool mem - cdef StringStore _string_store + cdef Language lang + cdef list tag_names - cdef Lexeme** _lex_ptr - cdef int* _idx_ptr - cdef int* _pos_ptr - cdef int* _ner_ptr - cdef Lexeme** lex - cdef int* idx - cdef int* pos - cdef int* ner + cdef TokenC* data cdef int length cdef int max_length - cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 - cdef int push_back(self, int i, Lexeme* lexeme) except -1 - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1 + cdef int push_back(self, int i, LexemeOrToken 
lex_or_tok) except -1 + + cpdef np.ndarray[long, ndim=2] get_array(self, list features) cdef class Token: - cdef StringStore _string_store + cdef public Language lang cdef public int i cdef public int idx - cdef public int pos - cdef public int ner + cdef int pos + cdef int lemma cdef public atom_t id cdef public atom_t cluster @@ -51,4 +66,4 @@ cdef class Token: cdef public float prob - cdef public flag_t flags + cdef public flags_t flags diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 721e6bb80..617feb269 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -1,7 +1,15 @@ # cython: profile=True +from preshed.maps cimport PreshMap +from preshed.counter cimport PreshCounter + from .lexeme cimport * cimport cython -from .tagger cimport POS, ENTITY + +import numpy as np +cimport numpy as np + +POS = 0 +ENTITY = 0 DEF PADDING = 5 @@ -17,23 +25,13 @@ cdef class Tokens: """A sequence of references to Lexeme objects. The Tokens class provides fast and memory-efficient access to lexical features, - and can efficiently export the data to a numpy array. Specific languages - create their own Tokens subclasses, to provide more convenient access to - language-specific features. + and can efficiently export the data to a numpy array. >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') - >>> tokens.string(0) - 'An' - >>> tokens.prob(0) > tokens.prob(1) - True - >>> tokens.can_noun(0) - False - >>> tokens.can_noun(1) - True """ - def __init__(self, StringStore string_store, string_length=0): - self._string_store = string_store + def __init__(self, Language lang, string_length=0): + self.lang = lang if string_length >= 3: size = int(string_length / 3.0) else: @@ -42,28 +40,18 @@ cdef class Tokens: # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # However, we need to remember the true starting places, so that we can # realloc. 
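That is, the buffer is over-allocated by PADDING slots on each side, every slot starts out pointing at the empty sentinel lexeme, and the pointer the rest of the class uses is offset forward by PADDING, so feature code can read a few tokens to the left of position 0 or past the end and simply see the sentinel rather than stray memory; _realloc then has to rewind by PADDING to recover the true allocation start, as the code below does. A rough pure-Python analogue of the layout, with illustrative names only::

    PADDING = 5
    EMPTY = object()   # stands in for &EMPTY_LEXEME

    def alloc_padded(size):
        # the true allocation, with PADDING sentinel slots on either side
        true_start = [EMPTY] * (size + 2 * PADDING)

        def data(i):
            # like self.data = data_start + PADDING in the Cython code
            return true_start[PADDING + i]

        return true_start, data

    true_start, data = alloc_padded(8)
    assert data(-PADDING) is EMPTY          # reading left of token 0 is safe
    assert data(8 + PADDING - 1) is EMPTY   # and so is reading past the end
    # To grow the buffer, realloc must be handed true_start, i.e. the data
    # pointer rewound by PADDING, and the offset is then applied again.
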
- self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) - self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) - self.lex = self._lex_ptr - self.idx = self._idx_ptr - self.pos = self._pos_ptr - self.ner = self._ner_ptr + data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC)) cdef int i for i in range(size + (PADDING*2)): - self.lex[i] = &EMPTY_LEXEME - self.lex += PADDING - self.idx += PADDING - self.pos += PADDING - self.ner += PADDING + data_start[i].lex = &EMPTY_LEXEME + self.data = data_start + PADDING self.max_length = size self.length = 0 def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i], - self.lex[i][0]) + return Token(self.lang, i, self.data[i].idx, self.data[i].pos, + self.data[i].lemma, self.data[i].lex[0]) def __iter__(self): for i in range(self.length): @@ -72,70 +60,78 @@ cdef class Tokens: def __len__(self): return self.length - cdef int push_back(self, int idx, Lexeme* lexeme) except -1: + cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: if self.length == self.max_length: self._realloc(self.length * 2) - self.lex[self.length] = lexeme - self.idx[self.length] = idx - self.pos[self.length] = 0 - self.ner[self.length] = 0 - self.length += 1 - return idx + lexeme.length - - cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1: - cdef int i - if lexemes == NULL: - return idx - elif n == 0: - i = 0 - while lexemes[i] != NULL: - idx = self.push_back(idx, lexemes[i]) - i += 1 + cdef TokenC* t = &self.data[self.length] + if LexemeOrToken is TokenC_ptr: + t[0] = lex_or_tok[0] else: - for i in range(n): - idx = self.push_back(idx, lexemes[i]) - return idx + t.lex = lex_or_tok + self.length += 1 + return idx + t.lex.length - cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1: - if tag_type == POS: - self.pos[i] = tag - elif tag_type == ENTITY: - self.ner[i] = tag + @cython.boundscheck(False) + cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids): + cdef int i, j + cdef attr_id_t feature + cdef np.ndarray[long, ndim=2] output + output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) + for i in range(self.length): + for j, feature in enumerate(attr_ids): + output[i, j] = get_attr(self.data[i].lex, feature) + return output + + def count_by(self, attr_id_t attr_id): + cdef int i + cdef attr_t attr + cdef size_t count + + cdef PreshCounter counts = PreshCounter(2 ** 8) + for i in range(self.length): + if attr_id == LEMMA: + attr = self.data[i].lemma + else: + attr = get_attr(self.data[i].lex, attr_id) + counts.inc(attr, 1) + return dict(counts) def _realloc(self, new_size): self.max_length = new_size n = new_size + (PADDING * 2) - self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) - self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) - self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) - self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int)) - self.lex = self._lex_ptr + PADDING - self.idx = self._idx_ptr + PADDING - self.pos = self._pos_ptr + PADDING - self.ner = self._ner_ptr + PADDING + # What we're storing is a "padded" array. We've jumped forward PADDING + # places, and are storing the pointer to that. This way, we can access + # words out-of-bounds, and get out-of-bounds markers. 
+ # Now that we want to realloc, we need the address of the true start, + # so we jump the pointer back PADDING places. + cdef TokenC* data_start = self.data - PADDING + data_start = self.mem.realloc(data_start, n * sizeof(TokenC)) + self.data = data_start + PADDING + cdef int i for i in range(self.length, self.max_length + PADDING): - self.lex[i] = &EMPTY_LEXEME + self.data[i].lex = &EMPTY_LEXEME @cython.freelist(64) cdef class Token: - def __init__(self, StringStore string_store, int i, int idx, int pos, int ner, - dict lex): - self._string_store = string_store + def __init__(self, Language lang, int i, int idx, + int pos, int lemma, dict lex): + self.lang = lang self.idx = idx self.pos = pos - self.ner = ner self.i = i self.id = lex['id'] + + self.lemma = lemma self.cluster = lex['cluster'] self.length = lex['length'] - self.postype = lex['postype'] - self.sensetype = lex['supersense'] + self.postype = lex['pos_type'] + self.sensetype = 0 self.sic = lex['sic'] - self.norm = lex['norm'] + self.norm = lex['dense'] self.shape = lex['shape'] - self.suffix = lex['asciied'] + self.suffix = lex['suffix'] self.prefix = lex['prefix'] self.prob = lex['prob'] @@ -145,5 +141,16 @@ cdef class Token: def __get__(self): if self.sic == 0: return '' - cdef bytes utf8string = self._string_store[self.sic] + cdef bytes utf8string = self.lang.lexicon.strings[self.sic] return utf8string.decode('utf8') + + property lemma: + def __get__(self): + if self.lemma == 0: + return self.string + cdef bytes utf8string = self.lang.lexicon.strings[self.lemma] + return utf8string.decode('utf8') + + property pos: + def __get__(self): + return self.lang.pos_tagger.tag_names[self.pos] diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 21818f05e..02d327b72 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -1,8 +1,20 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t +from libc.stdint cimport uint8_t ctypedef uint64_t hash_t ctypedef char* utf8_t -ctypedef uint64_t flag_t +ctypedef uint32_t attr_t +ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t ctypedef uint16_t tag_t + + +cdef struct Morphology: + uint8_t number + uint8_t tenspect # Tense/aspect/voice + uint8_t mood + uint8_t gender + uint8_t person + uint8_t case + uint8_t misc diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd index 82ae50022..5ef4113d5 100644 --- a/spacy/utf8string.pxd +++ b/spacy/utf8string.pxd @@ -1,5 +1,6 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool +from murmurhash.mrmr cimport hash64 from .typedefs cimport utf8_t, id_t, hash_t @@ -11,11 +12,23 @@ cdef struct Utf8Str: int length +cdef struct UniStr: + Py_UNICODE* chars + size_t n + hash_t key + + +cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil: + s.chars = &chars[start] + s.n = end - start + s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) + + cdef class StringStore: cdef Pool mem - cdef PreshMap table + cdef PreshMap _map cdef Utf8Str* strings cdef int size cdef int _resize_at - cdef Utf8Str* intern(self, char* chars, int length) except NULL + cdef const Utf8Str* intern(self, char* chars, int length) except NULL diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx index 18d4a4e5e..1d2b7a264 100644 --- a/spacy/utf8string.pyx +++ b/spacy/utf8string.pyx @@ -5,10 +5,11 @@ import codecs SEPARATOR = '\n|-SEP-|\n' + cdef class StringStore: def __init__(self): self.mem = Pool() - self.table = PreshMap() + self._map = PreshMap() self._resize_at = 10000 self.strings = 
self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 @@ -17,26 +18,30 @@ cdef class StringStore: def __get__(self): return self.size-1 - def __getitem__(self, string_or_id): + def __getitem__(self, object string_or_id): cdef bytes byte_string - cdef Utf8Str* utf8str - if type(string_or_id) == int or type(string_or_id) == long: + cdef const Utf8Str* utf8str + if isinstance(string_or_id, int) or isinstance(string_or_id, long): if string_or_id < 1 or string_or_id >= self.size: raise IndexError(string_or_id) utf8str = &self.strings[string_or_id] return utf8str.chars[:utf8str.length] - elif type(string_or_id) == bytes: + elif isinstance(string_or_id, bytes): utf8str = self.intern(string_or_id, len(string_or_id)) return utf8str.i + elif isinstance(string_or_id, unicode): + byte_string = string_or_id.encode('utf8') + utf8str = self.intern(byte_string, len(byte_string)) + return utf8str.i else: raise TypeError(type(string_or_id)) - cdef Utf8Str* intern(self, char* chars, int length) except NULL: + cdef const Utf8Str* intern(self, char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. We waste # slot 0 to simplify the code, because it doesn't matter. assert length != 0 cdef hash_t key = hash64(chars, length * sizeof(char), 0) - cdef void* value = self.table.get(key) + cdef void* value = self._map.get(key) cdef size_t i if value == NULL: if self.size == self._resize_at: @@ -48,7 +53,7 @@ cdef class StringStore: self.strings[i].chars = self.mem.alloc(length, sizeof(char)) memcpy(self.strings[i].chars, chars, length) self.strings[i].length = length - self.table.set(key, self.size) + self._map.set(key, self.size) self.size += 1 else: i = value diff --git a/spacy/util.py b/spacy/util.py index 5062ca6db..1c25aeaf2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,7 +13,8 @@ def utf8open(loc, mode='r'): def read_lang_data(name): data_dir = path.join(DATA_DIR, name) - tokenization = read_tokenization(name) + with open(path.join(data_dir, 'specials.json')) as file_: + tokenization = ujson.load(file_) prefix = read_prefix(data_dir) suffix = read_suffix(data_dir) infix = read_infix(data_dir) @@ -26,12 +27,14 @@ def read_prefix(data_dir): expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) return expression + def read_suffix(data_dir): - with utf8open(path.join(data_dir, 'suffix')) as file_: + with utf8open(path.join(data_dir, 'suffix')) as file_: entries = file_.read().split('\n') - expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()]) + expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) return expression + def read_infix(data_dir): with utf8open(path.join(data_dir, 'infix')) as file_: entries = file_.read().split('\n') diff --git a/tests/test_ner.py b/tests/depr_test_ner.py similarity index 100% rename from tests/test_ner.py rename to tests/depr_test_ner.py diff --git a/tests/test_contractions.py b/tests/test_contractions.py index b7347a617..1e697afd2 100644 --- a/tests/test_contractions.py +++ b/tests/test_contractions.py @@ -20,15 +20,18 @@ def test_apostrophe(): def test_LL(): tokens = EN.tokenize("we'll") assert len(tokens) == 2 - assert tokens[1].string == "will" + assert tokens[1].string == "'ll" + assert tokens[1].lemma == "will" assert tokens[0].string == "we" def test_aint(): tokens = EN.tokenize("ain't") assert len(tokens) == 2 - assert tokens[0].string == "are" - assert tokens[1].string == "not" + assert tokens[0].string == "ai" + assert 
tokens[0].lemma == "be" + assert tokens[1].string == "n't" + assert tokens[1].lemma == "not" def test_capitalized(): @@ -38,4 +41,12 @@ def test_capitalized(): assert len(tokens) == 2 tokens = EN.tokenize("Ain't") assert len(tokens) == 2 - assert tokens[0].string == "Are" + assert tokens[0].string == "Ai" + assert tokens[0].lemma == "be" + + +def test_punct(): + tokens = EN.tokenize("We've") + assert len(tokens) == 2 + tokens = EN.tokenize("``We've") + assert len(tokens) == 3 diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py index 6bb58e661..143be607d 100644 --- a/tests/test_emoticons.py +++ b/tests/test_emoticons.py @@ -27,3 +27,9 @@ def test_tweebo_challenge(): assert tokens[19].string == '")' assert tokens[20].string == ':>' assert tokens[21].string == '....' + + +def test_false_positive(): + text = "example:)" + tokens = EN.tokenize(text) + assert len(tokens) == 3 diff --git a/tests/test_intern.py b/tests/test_intern.py index 63b4b3433..a7a801b05 100644 --- a/tests/test_intern.py +++ b/tests/test_intern.py @@ -19,8 +19,12 @@ def test_save_bytes(sstore): def test_save_unicode(sstore): - with pytest.raises(TypeError): - A_i = sstore['A'] + Hello_i = sstore[u'Hello'] + assert Hello_i == 1 + assert sstore[u'Hello'] == 1 + assert sstore[u'goodbye'] != Hello_i + assert sstore[u'hello'] != Hello_i + assert Hello_i == 1 def test_zero_id(sstore): diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py new file mode 100644 index 000000000..379ebd3bb --- /dev/null +++ b/tests/test_iter_lexicon.py @@ -0,0 +1,15 @@ +import pytest + +from spacy.en import EN + +def test_range_iter(): + EN.load() + for i in range(len(EN.lexicon)): + lex = EN.lexicon[i] + + +def test_iter(): + EN.load() + i = 0 + for lex in EN.lexicon: + i += 1 diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py new file mode 100644 index 000000000..2047e4d2c --- /dev/null +++ b/tests/test_lemmatizer.py @@ -0,0 +1,34 @@ +from spacy.lemmatizer import Lemmatizer, read_index, read_exc +from spacy.util import DATA_DIR +from os import path + +import pytest + + +def test_read_index(): + wn = path.join(DATA_DIR, 'wordnet') + index = read_index(path.join(wn, 'index.noun')) + assert 'man' in index + assert 'plantes' not in index + assert 'plant' in index + + +def test_read_exc(): + wn = path.join(DATA_DIR, 'wordnet') + exc = read_exc(path.join(wn, 'verb.exc')) + assert exc['was'] == ('be',) + + +@pytest.fixture +def lemmatizer(): + return Lemmatizer(path.join(DATA_DIR, 'wordnet')) + + +def test_noun_lemmas(lemmatizer): + do = lemmatizer.noun + + assert do('aardwolves') == set(['aardwolf']) + assert do('aardwolf') == set(['aardwolf']) + assert do('planets') == set(['planet']) + assert do('ring') == set(['ring']) + assert do('axes') == set(['axis', 'axe', 'ax']) diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py index 10276d8ea..c1fe2d847 100644 --- a/tests/test_lexeme_flags.py +++ b/tests/test_lexeme_flags.py @@ -7,6 +7,7 @@ from spacy.lexeme import * def test_is_alpha(): + EN.load() the = EN.lexicon['the'] assert the['flags'] & (1 << IS_ALPHA) year = EN.lexicon['1999'] @@ -16,6 +17,7 @@ def test_is_alpha(): def test_is_digit(): + EN.load() the = EN.lexicon['the'] assert not the['flags'] & (1 << IS_DIGIT) year = EN.lexicon['1999'] diff --git a/tests/test_rules.py b/tests/test_rules.py deleted file mode 100644 index b19a1c3f1..000000000 --- a/tests/test_rules.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy import util - - -def test_load_en(): - rules = util.read_tokenization('en') - assert 
len(rules) != 0 - aint = [rule for rule in rules if rule[0] == "ain't"][0] - chunk, pieces = aint - assert chunk == "ain't" - assert pieces[0] == "are" - assert pieces[1] == "not" diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index fb5f78ed7..21d115b9b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -34,7 +34,7 @@ def test_digits(): def test_contraction(): tokens = EN.tokenize("don't giggle") assert len(tokens) == 3 - assert tokens[1].sic == EN.lexicon["not"]['sic'] + assert tokens[1].sic == EN.lexicon["n't"]['sic'] tokens = EN.tokenize("i said don't!") assert len(tokens) == 5 assert tokens[4].sic == EN.lexicon['!']['sic'] @@ -71,30 +71,39 @@ def test_cnts1(): tokens = EN.tokenize(text) assert len(tokens) == 8 + def test_cnts2(): text = u"""U.N. regulations are not a part of their concern.""" tokens = EN.tokenize(text) assert len(tokens) == 10 + def test_cnts3(): text = u"“Isn't it?”" tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + def test_cnts4(): text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ tokens = EN.tokenize(text) - assert len(tokens) == 15 + words = [t.string for t in tokens] + assert len(words) == 15 + def test_cnts5(): text = """'Me too!', Mr. P. Delaware cried. """ tokens = EN.tokenize(text) assert len(tokens) == 11 + def test_cnts6(): text = u'They ran about 10km.' tokens = EN.tokenize(text) - assert len(tokens) == 6 + words = [t.string for t in tokens] + assert len(words) == 6 + #def test_cnts7(): # text = 'But then the 6,000-year ice age came...'
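test_cnts2 through test_cnts6 all follow the same tokenize-then-count pattern, so they could be collapsed into one parametrized test. A possible sketch; the texts and expected counts are copied from the tests above, and the parametrized form itself is only a suggestion rather than part of this change::

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals

    import pytest

    from spacy.en import EN


    @pytest.mark.parametrize("text,n_tokens", [
        ("U.N. regulations are not a part of their concern.", 10),
        ("“Isn't it?”", 6),
        ('Yes! "I\'d rather have a walk", Ms. Comble sighed. ', 15),
        ("'Me too!', Mr. P. Delaware cried. ", 11),
        ("They ran about 10km.", 6),
    ])
    def test_token_counts(text, n_tokens):
        tokens = EN.tokenize(text)
        assert len(tokens) == n_tokens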