diff --git a/data/en/prefix b/data/en/prefix
index 64a3f1f2f..cb9bb4d7b 100644
--- a/data/en/prefix
+++ b/data/en/prefix
@@ -11,3 +11,8 @@ $
'
``
`
+#
+US$
+C$
+A$
+a-
diff --git a/data/en/suffix b/data/en/suffix
index 77400d0fd..8ba48296d 100644
--- a/data/en/suffix
+++ b/data/en/suffix
@@ -1,13 +1,13 @@
,
-"
-)
-]
-}
-*
-!
-?
+\"
+\)
+\]
+\}
+\*
+\!
+\?
%
-$
+\$
>
:
;
@@ -16,7 +16,8 @@ $
''
's
'S
-.
-..
-...
-....
+\.\.
+\.\.\.
+\.\.\.\.
+(?<=[a-z0-9])\.
+(?<=[0-9])km
diff --git a/data/en/tokenization b/data/en/tokenization
index 6bf0d738b..382b7e383 100644
--- a/data/en/tokenization
+++ b/data/en/tokenization
@@ -4,101 +4,9 @@
#*---* ---
#*'s 's
-'s 's
-'S 'S
-ain't are not
-aren't are not
-can't can not
-cannot can not
-could've could have
-couldn't could not
-couldn't've could not have
-didn't did not
-doesn't does not
-don't do not
-hadn't had not
-hadn't've had not have
-hasn't has not
-haven't have not
-he'd he would
-he'd've he would have
-he'll he will
-he's he 's
-how'd he would
-how'll he will
-how's how 's
-I'd I would
-I'd've I would have
-I'll I will
-I'm I am
-I'ma I will
-I've I have
-isn't is not
-it'd it would
-it'd've it would have
-it'll it will
-it's it 's
-let's let 's
-mightn't might not
-mightn't've might not have
-might've might have
-mustn't must not
-must've must have
-needn't need not
-not've not have
-shan't shall not
-she'd she would
-she'd've she would have
-she'll she will
-she's she 's
-should've should have
-shouldn't should not
-shouldn't've should not have
-that's that 's
-there'd there would
-there'd've there would have
-there's there is
-they'd there would
-they'd've they would have
-they'll they will
-they're they are
-they've they have
-wasn't was not
-we'd we would
-we'd've we would have
-we'll we will
-we're we are
-we've we have
-weren't were not
-what'll what will
-what're what are
-what's what 's
-what've what have
-when's when 's
-where'd where would
-where's where 's
-where've where have
-who'd who would
-who'll who will
-who're who are
-who's who 's
-who've who have
-why'll who will
-why're why are
-why's why 's
-won't will not
-would've would have
-wouldn't would not
-wouldn't've would not have
-you'd you would
-you'd've you would have
-you'll you will
-you're you are
-you've you have
-'em them
-'ol old
10km 10 km
U.S. U.S.
+U.K. U.K.
non-U.S. non-U.S.
U.N. U.N.
Co. Co.
@@ -115,7 +23,12 @@ A.G. A.G.
Rep. Rep.
Ms. Ms.
Mr. Mr.
+Mrs. Mrs.
a.m. a.m.
+Sen. Sen.
+INC. INC.
+CO. CO.
+COS. COS.
p.m. p.m.
Nos. Nos.
a.k.a. a.k.a.
@@ -127,6 +40,7 @@ E. E.
F. F.
G. G.
H. H.
+I. I.
J. J.
K. K.
L. L.
@@ -205,6 +119,9 @@ Wash. Wash.
W.Va. W.Va.
Wis. Wis.
Wyo. Wyo.
+L.A. L.A.
+R.H. R.H.
+Gov. Gov.
'' ''
:) :)
<3 <3
@@ -262,3 +179,19 @@ V_V V_V
o.O o.O
") ")
.... ....
+a- a -
+Messrs. Messrs.
+No. No.
+vs. vs.
+Gen. Gen.
+Cos. Cos.
+L.J. L.J.
+D.T. D.T.
+Prof. Prof.
+Bros. Bros.
+J.C. J.C.
+Neb. Neb.
+Adm. Adm.
+U.S.S.R. U.S.S.R.
+Rev. Rev.
+H.F. H.F.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 97681bfd8..fb738aa32 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,45 +3,228 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
+================================
spaCy NLP Tokenizer and Lexicon
================================
-spaCy is a library for industrial strength NLP in Python. Its core
-values are:
+spaCy is a library for industrial-strength NLP in Python and Cython. spaCy's
+take on NLP is that it's mostly about feature extraction --- that's the part
+that's specific to NLP, so that's what an NLP library should focus on.
-* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x
- faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is
- over 100x faster than Stanford's.
+spaCy also believes that for NLP, **efficiency is critical**. If you're
+running batch jobs, you probably have an enormous amount of data; if you're
+serving requests one-by-one, you want lower latency and fewer servers. Even if
+you're doing exploratory research on relatively small samples, you should still
+value efficiency, because it means you can run more experiments.
-* **Accuracy**: All spaCy tools are within 0.5% of the current published
- state-of-the-art, on both news and web text. NLP moves fast, so always check
- the numbers --- and don't settle for tools that aren't backed by
- rigorous recent evaluation.
+Depending on the task, spaCy is between 10 and 200 times faster than NLTK,
+often with much better accuracy. See Benchmarks for details, and
+Why is spaCy so fast? for a discussion of the algorithms and implementation
+that make this possible.
-* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You
- get 1 --- the best one --- with a simple, low-level interface. This keeps the
- code-base small and concrete. Our Python APIs use lists and
- dictionaries, and our C/Cython APIs use arrays and simple structs.
++---------+----------+-------------+----------+
+| System | Tokenize | --> Counts | --> Stem |
++---------+----------+-------------+----------+
+| spaCy | 1m42s | 1m59s | 1m59s |
++---------+----------+-------------+----------+
+| NLTK    | 20m2s    | 28m24s      | 52m28s   |
++---------+----------+-------------+----------+
+
+Times for 100m words of text.
+
+
+Unique Lexicon-centric design
+=============================
+
+spaCy helps you build models that generalise better, by making it easy to use
+more robust features. Instead of a list of strings, the tokenizer returns
+references to rich lexical types. Features which ask about the word's Brown cluster,
+its typical part-of-speech tag, how it's usually cased etc require no extra effort:
+
+ >>> from spacy.en import EN
+ >>> from spacy.feature_names import *
+ >>> feats = (
+ SIC, # ID of the original word form
+ STEM, # ID of the stemmed word form
+ CLUSTER, # ID of the word's Brown cluster
+ IS_TITLE, # Was the word title-cased?
+ POS_TYPE # A cluster ID describing what POS tags the word is usually assigned
+ )
+ >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^')
+ >>> tokens.to_array(feats)[:5]
+    array([[ 1, 2, 3, 4, 5],
+ [...],
+ [...],
+ [...]])
+
+
+spaCy is designed to **make the right thing easy**, where the right thing is to:
+
+* **Use rich distributional and orthographic features**. Without these, your model
+ will be very brittle and domain dependent.
+
+* **Compute features per type, not per token**. Because of Zipf's law, you can
+  expect this to be dramatically more efficient (see the sketch below).
+
+* **Minimize string processing**, and instead compute with arrays of ID ints.
+
+For the current list of lexical features, see `Lexical Features`_.
-Comparison
-----------
+
+.. _lexical features: features.html
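+
+To make the per-type point concrete, here is a minimal sketch (not spaCy's
+actual API) of computing an expensive feature once per word *type* and then
+re-using it for every token of that type:
+
+.. code-block:: python
+
+    def word_shape(string):
+        # Stand-in for an expensive orthographic feature.
+        return ''.join('X' if c.isupper() else 'x' if c.islower() else
+                       'd' if c.isdigit() else c for c in string)
+
+    def features_per_type(words):
+        cache = {}
+        for word in words:
+            if word not in cache:
+                # Computed once per distinct word form (type)...
+                cache[word] = word_shape(word)
+        # ...then looked up once per token.
+        return [cache[word] for word in words]
+
+Because a handful of very frequent types accounts for most tokens, the cache
+hit-rate on natural text is extremely high.
+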
-+----------------+-------------+--------+---------------+--------------+
-| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. (web) |
-+----------------+-------------+--------+---------------+--------------+
-| spaCy | 107,000 | 1.3gb | 96.7 | |
-+----------------+-------------+--------+---------------+--------------+
-| Stanford | 8,000 | 1.5gb | 96.7 | |
-+----------------+-------------+--------+---------------+--------------+
-| NLTK | 543 | 61mb | 94.0 | |
-+----------------+-------------+--------+---------------+--------------+
+Tokenization done right
+=======================
+
+Most tokenizers rely on complicated regular expressions. Often, they leave you
+with no way to align the tokens back to the original string --- a vital feature
+if you want to display some mark-up, such as spelling correction. The regular
+expressions also interact, making it hard to accommodate special cases.
+
+spaCy introduces a **novel tokenization algorithm** that's much faster and much
+more flexible:
+
+.. code-block:: python
+
+ def tokenize(string, prefixes={}, suffixes={}, specials={}):
+ '''Sketch of spaCy's tokenization algorithm.'''
+ tokens = []
+ cache = {}
+ for chunk in string.split():
+ # Because of Zipf's law, the cache serves the majority of "chunks".
+ if chunk in cache:
+                tokens.extend(cache[chunk])
+ continue
+ key = chunk
+
+ subtokens = []
+ # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . :
+ # If we split one off, check whether we're left with a special-case,
+ # e.g. contractions (can't, won't, etc), emoticons, abbreviations, etc.
+ # This makes the tokenization easy to update and customize.
+ while chunk:
+ prefix, chunk = _consume_prefix(chunk, prefixes)
+ if prefix:
+ subtokens.append(prefix)
+ if chunk in specials:
+ subtokens.extend(specials[chunk])
+ break
+ suffix, chunk = _consume_suffix(chunk, suffixes)
+ if suffix:
+ subtokens.append(suffix)
+ if chunk in specials:
+ subtokens.extend(specials[chunk])
+ break
+            cache[key] = subtokens
+            tokens.extend(subtokens)
+        return tokens
+
+Your data is going to have its own quirks, so it's really useful to have
+a tokenizer you can easily control. To see the limitations of the standard
+regex-based approach, check out `CMU's recent work on tokenizing tweets `_.
+Despite a lot of careful attention, they can't handle all of their
+known emoticons correctly --- doing so would interfere with the way they
+process other punctuation. This isn't a problem for spaCy: we just add them
+all to the special tokenization rules.
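+
+Under the sketch above, an emoticon is just another entry in the special-cases
+table: the whole chunk is looked up before any prefix or suffix splitting, so
+the new rule can't interfere with the punctuation handling. A simplified
+illustration (not spaCy's real rule format):
+
+.. code-block:: python
+
+    # Chunks that bypass prefix/suffix splitting, mapped to their tokens.
+    specials = {
+        'U.S.': ['U.S.'],
+        ':)': [':)'],
+        '<3': ['<3'],
+        'o.O': ['o.O'],
+    }
+
+    def split_chunk(chunk, specials=specials):
+        # Look the whole chunk up before any affix splitting.
+        return specials.get(chunk, [chunk])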
+
+spaCy's tokenizer is also incredibly efficient: it can create an inverted
+index of the 1.8 billion word Gigaword corpus in under half an hour --- on
+a Macbook Air. See the `inverted index tutorial`_.
+
+.. _inverted index tutorial: index_tutorial.html
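+
+An inverted index simply maps each word to the set of documents it occurs in.
+A minimal sketch, using ``str.split`` as a stand-in for the tokenizer:
+
+.. code-block:: python
+
+    from collections import defaultdict
+
+    def build_inverted_index(documents):
+        index = defaultdict(set)
+        for doc_id, text in enumerate(documents):
+            for word in text.split():   # a real system would tokenize here
+                index[word].add(doc_id)
+        return index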
+
+Comparison with NLTK
+====================
+
+`NLTK `_ provides interfaces to a wide variety of NLP
+tools and resources, and its own implementations of a few algorithms. It comes
+with comprehensive documentation, and a book introducing concepts in NLP. For
+these reasons, it's very widely known. However, if you're trying to make money
+or do cutting-edge research, NLTK is not a good choice.
+
+The `list of stuff in NLTK `_ looks impressive,
+but almost none of it is useful for real work. You're not going to make any money,
+or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation,
+etc. Most of NLTK is there to assist in explaining ideas in computational
+linguistics, at roughly an undergraduate level.
+But it also claims to support serious work, by wrapping external tools.
+
+In a pretty well known essay, Joel Spolsky discusses the pain of dealing with
+`leaky abstractions `_.
+An abstraction tells you to not care about implementation
+details, but sometimes the implementation matters after all. When it
+does, you have to waste time revising your assumptions.
+
+NLTK's wrappers call external tools via subprocesses, and wrap this up so
+that it looks like a native API. This abstraction leaks *a lot*. The system
+calls impose far more overhead than a normal Python function call, which makes
+the most natural way to program against the API infeasible.
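+
+The difference is easy to see in miniature. Below is a schematic sketch (the
+command is a placeholder, not a real tagger invocation) of the two calling
+patterns: one subprocess per document, versus one subprocess for a whole
+batch:
+
+.. code-block:: python
+
+    import subprocess
+
+    TAGGER_CMD = ['java', '-jar', 'tagger.jar']   # placeholder command
+
+    def tag_one_by_one(documents):
+        # Pays JVM start-up and model loading for *every* document.
+        for text in documents:
+            proc = subprocess.Popen(TAGGER_CMD, stdin=subprocess.PIPE,
+                                    stdout=subprocess.PIPE)
+            yield proc.communicate(text.encode('utf8'))[0]
+
+    def tag_batch(documents):
+        # Pays the start-up cost once, for the whole batch.
+        proc = subprocess.Popen(TAGGER_CMD, stdin=subprocess.PIPE,
+                                stdout=subprocess.PIPE)
+        output, _ = proc.communicate('\n\n'.join(documents).encode('utf8'))
+        return output.split(b'\n\n')
+
+Calling the wrapped tagger document-by-document follows the first pattern,
+which is why it is so much slower than invoking the same model once as a
+batch process.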
+
+
+Case study: POS tagging
+-----------------------
+
+Here's a quick comparison of the following POS taggers:
+
+* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process
+ from the command-line;
+* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via
+ NLTK's wrapper;
+* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document.
+* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document.
+
+
++-------------------+-------------+--------+
+| System | Speed (w/s) | % Acc. |
++-------------------+-------------+--------+
+| spaCy | 107,000 | 96.7 |
++-------------------+-------------+--------+
+| Stanford (CLI) | 8,000 | 96.7 |
++-------------------+-------------+--------+
+| nltk.pos_tag | 543 | 94.0 |
++-------------------+-------------+--------+
+| nltk.tag.stanford | 209 | 96.7 |
++-------------------+-------------+--------+
+
+Experimental details TODO. Three things are apparent from this comparison:
+
+1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate;
+
+2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower
+ than invoking the model once as a batch process, via the command-line;
+
+3. spaCy is over 10x faster than the Stanford tagger, even when called
+ **sentence-by-sentence**.
+
+The problem is that NLTK simply wraps the command-line
+interfaces of these tools, so communication is via a subprocess. NLTK does not
+even hold open a pipe for you --- the model is reloaded, again and again.
+
+To use the wrapper effectively, you should batch up your text as much as possible.
+This probably isn't how you would like to structure your pipeline, and you
+might not be able to batch up much text at all, e.g. if serving a single
+request means processing a single document.
+Technically, NLTK does give you Python functions to access lots of different
+systems --- but you can't use them as you would expect to use a normal Python
+function. The abstraction leaks.
+
+Here's the bottom line: the Stanford tools are written in Java, so using them
+from Python sucks. You shouldn't settle for this. It's a problem that springs
+purely from the tooling, rather than the domain.
+
+Summary
+-------
+
+NLTK is a well-known Python library for NLP, but for the important bits, you
+don't get actual Python modules. You get wrappers which call out to external
+tools, via subprocesses. This is not at all the same thing.
+
+spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other
+high-performance Python libraries. So you get a native Python API, but the
+performance you expect from a program written in C.
.. toctree::
:hidden:
:maxdepth: 3
-
- what/index.rst
- why/index.rst
- how/index.rst
+
+ features.rst
+ license_stories.rst
diff --git a/setup.py b/setup.py
index c67bed4a1..827d44fc6 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,8 @@ import os.path
from os import path
from glob import glob
+import numpy
+
def clean(ext):
for pyx in ext.sources:
@@ -34,7 +36,7 @@ compile_args = []
link_args = []
libs = []
-includes = ['.']
+includes = ['.', numpy.get_include()]
cython_includes = ['.']
@@ -50,18 +52,20 @@ exts = [
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
+ Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
+ Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
+ Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++",
+ include_dirs=includes),
+ #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
+ #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
+ #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
+ #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
#Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
- Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
+ #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
+ #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
+ #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
+ #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
]
diff --git a/spacy/context.pxd b/spacy/context.pxd
deleted file mode 100644
index 8f798d347..000000000
--- a/spacy/context.pxd
+++ /dev/null
@@ -1,66 +0,0 @@
-from thinc.typedefs cimport atom_t
-from .typedefs cimport hash_t
-from .tokens cimport Tokens
-from .lexeme cimport Lexeme
-
-
-cdef class Token:
- cdef readonly atom_t sic
- cdef readonly atom_t cluster
- cdef readonly atom_t norm
- cdef readonly atom_t shape
- cdef readonly atom_t asciied
- cdef readonly atom_t prefix
- cdef readonly atom_t suffix
- cdef readonly atom_t length
-
- cdef readonly atom_t postype
- cdef readonly atom_t nertype
- cdef readonly atom_t sensetype
-
- cdef readonly atom_t is_alpha
- cdef readonly atom_t is_ascii
- cdef readonly atom_t is_digit
- cdef readonly atom_t is_lower
- cdef readonly atom_t is_punct
- cdef readonly atom_t is_space
- cdef readonly atom_t is_title
- cdef readonly atom_t is_upper
- cdef readonly atom_t like_url
- cdef readonly atom_t like_number
- cdef readonly atom_t oft_lower
- cdef readonly atom_t oft_title
- cdef readonly atom_t oft_upper
-
- cdef readonly atom_t in_males
- cdef readonly atom_t in_females
- cdef readonly atom_t in_surnames
- cdef readonly atom_t in_places
- cdef readonly atom_t in_games
- cdef readonly atom_t in_celebs
- cdef readonly atom_t in_names
-
- cdef readonly atom_t pos
- cdef readonly atom_t sense
- cdef readonly atom_t ner
-
-
-cdef class Slots:
- cdef readonly Token P4
- cdef readonly Token P3
- cdef readonly Token P2
- cdef readonly Token P1
- cdef readonly Token N0
- cdef readonly Token N1
- cdef readonly Token N2
- cdef readonly Token N3
- cdef readonly Token N4
-
-
-cdef int N_FIELDS
-
-
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
-
-
-cpdef Slots FIELD_IDS
diff --git a/spacy/context.pyx b/spacy/context.pyx
deleted file mode 100644
index aeb78ae5c..000000000
--- a/spacy/context.pyx
+++ /dev/null
@@ -1,126 +0,0 @@
-from murmurhash.mrmr cimport hash64
-from .lexeme cimport *
-
-
-cdef class Slots:
- def __init__(self):
- self.P4 = Token()
- self.P3 = Token()
- self.P2 = Token()
- self.P1 = Token()
- self.N0 = Token()
- self.N1 = Token()
- self.N2 = Token()
- self.N3 = Token()
- self.N4 = Token()
-
-
-cdef void _number_token(Token t, int* n_fields):
- cdef int i = n_fields[0]
- t.sic = i; i += 1
- t.cluster = i; i += 1
- t.norm = i; i += 1
- t.shape = i; i += 1
- t.prefix = i; i += 1
- t.suffix = i; i += 1
- t.length = i; i += 1
-
- t.postype = i; i += 1
- t.nertype = i; i += 1
- t.sensetype = i; i += 1
-
- t.is_alpha = i; i += 1
- t.is_ascii = i; i += 1
- t.is_digit = i; i += 1
- t.is_lower = i; i += 1
- t.is_punct = i; i += 1
- t.is_space = i; i += 1
- t.is_title = i; i += 1
- t.is_upper = i; i += 1
-
- t.like_number = i; i += 1
- t.like_url = i; i += 1
-
- t.oft_lower = i; i += 1
- t.oft_title = i; i += 1
- t.oft_upper = i; i += 1
-
- t.in_males = i; i += 1
- t.in_females = i; i += 1
- t.in_surnames = i; i += 1
- t.in_places = i; i += 1
- t.in_games = i; i += 1
- t.in_celebs = i; i += 1
- t.in_names = i; i += 1
-
- t.pos = i; i += 1
- t.sense = i; i += 1
- t.ner = i; i += 1
-
- n_fields[0] = i
-
-
-cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner):
- c[t.sic] = lex.sic
- c[t.cluster] = lex.cluster
- c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
- c[t.shape] = lex.shape
- c[t.asciied] = lex.asciied
- c[t.prefix] = lex.prefix
- c[t.suffix] = lex.suffix
- c[t.length] = lex.length
-
- c[t.postype] = lex.postype
- c[t.nertype] = 0
- c[t.sensetype] = 0
-
- c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)
- c[t.is_digit] = lex.flags & (1 << IS_DIGIT)
- c[t.is_lower] = lex.flags & (1 << IS_LOWER)
- c[t.is_punct] = lex.flags & (1 << IS_PUNCT)
- c[t.is_space] = lex.flags & (1 << IS_SPACE)
- c[t.is_title] = lex.flags & (1 << IS_TITLE)
- c[t.is_upper] = lex.flags & (1 << IS_UPPER)
- c[t.like_url] = lex.flags & (1 << LIKE_URL)
- c[t.like_number] = lex.flags & (1 << LIKE_NUMBER)
- c[t.oft_lower] = lex.flags & (1 << OFT_LOWER)
- c[t.oft_title] = lex.flags & (1 << OFT_TITLE)
- c[t.oft_upper] = lex.flags & (1 << OFT_UPPER)
-
- c[t.in_males] = lex.flags & (1 << IN_MALES)
- c[t.in_females] = lex.flags & (1 << IN_FEMALES)
- c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES)
- c[t.in_places] = lex.flags & (1 << IN_PLACES)
- c[t.in_games] = lex.flags & (1 << IN_GAMES)
- c[t.in_celebs] = lex.flags & (1 << IN_CELEBS)
- c[t.in_names] = lex.flags & (1 << IN_NAMES)
-
- c[t.pos] = pos
- c[t.sense] = 0
- c[t.ner] = ner
-
-
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
- _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
- _fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3])
- _fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
- _fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
- _fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
- _fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
- _fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
- _fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3])
- _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
- return 1
-
-
-N_FIELDS = 0
-FIELD_IDS = Slots()
-_number_token(FIELD_IDS.P4, &N_FIELDS)
-_number_token(FIELD_IDS.P3, &N_FIELDS)
-_number_token(FIELD_IDS.P2, &N_FIELDS)
-_number_token(FIELD_IDS.P1, &N_FIELDS)
-_number_token(FIELD_IDS.N0, &N_FIELDS)
-_number_token(FIELD_IDS.N1, &N_FIELDS)
-_number_token(FIELD_IDS.N2, &N_FIELDS)
-_number_token(FIELD_IDS.N3, &N_FIELDS)
-_number_token(FIELD_IDS.N4, &N_FIELDS)
diff --git a/spacy/en.pxd b/spacy/en.pxd
index a7c643eba..2ca081e47 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -1,5 +1,133 @@
-from spacy.lang cimport Language
-from spacy.tokens cimport Tokens
+from thinc.typedefs cimport atom_t
+
+from .lang cimport Language
+from .tokens cimport Tokens
+from .tokens cimport TokenC
+
+
+cpdef enum en_person_t:
+ NO_PERSON
+ FIRST
+ SECOND
+ THIRD
+ NON_THIRD
+
+
+cpdef enum en_number_t:
+ NO_NUMBER
+ SINGULAR
+ PLURAL
+ MASS
+
+
+cpdef enum en_gender_t:
+ NO_GENDER
+ MASCULINE
+ FEMININE
+ NEUTER
+
+
+cpdef enum en_case_t:
+ NO_CASE
+ NOMINATIVE
+ GENITIVE
+ ACCUSATIVE
+ REFLEXIVE
+ DEMONYM
+
+
+cpdef enum en_tenspect_t:
+ NO_TENSE
+ BASE_VERB
+ PRESENT
+ PAST
+ PASSIVE
+ ING
+ MODAL
+
+
+cpdef enum misc_t:
+ NO_MISC
+ COMPARATIVE
+ SUPERLATIVE
+ RELATIVE
+ NAME
+
+
+# Flags
+cpdef enum FlagID:
+ IS_ALPHA
+ IS_ASCII
+ IS_DIGIT
+ IS_LOWER
+ IS_PUNCT
+ IS_SPACE
+ IS_TITLE
+ IS_UPPER
+
+ LIKE_URL
+ LIKE_NUMBER
+
+ OFT_LOWER
+ OFT_TITLE
+ OFT_UPPER
+
+ IN_MALES
+ IN_FEMALES
+ IN_SURNAMES
+ IN_PLACES
+ IN_GAMES
+ IN_CELEBS
+ IN_NAMES
+
+
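+# Offsets into the atom_t context array used by the POS tagger. Each window
+# position (P2, P1, W, N1, N2) contributes eight consecutive fields; see
+# fill_pos_context() in en.pyx.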
+cpdef enum:
+ P2_sic
+ P2_cluster
+ P2_shape
+ P2_prefix
+ P2_suffix
+ P2_pos
+ P2_lemma
+ P2_pos_type
+
+ P1_sic
+ P1_cluster
+ P1_shape
+ P1_prefix
+ P1_suffix
+ P1_pos
+ P1_lemma
+ P1_pos_type
+
+ W_sic
+ W_cluster
+ W_shape
+ W_prefix
+ W_suffix
+ W_pos
+ W_lemma
+ W_pos_type
+
+ N1_sic
+ N1_cluster
+ N1_shape
+ N1_prefix
+ N1_suffix
+ N1_pos
+ N1_lemma
+ N1_pos_type
+
+ N2_sic
+ N2_cluster
+ N2_shape
+ N2_prefix
+ N2_suffix
+ N2_pos
+ N2_lemma
+ N2_pos_type
+
+ N_CONTEXT_FIELDS
cdef class English(Language):
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 95c1cbd94..3ed0eaaa9 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -30,14 +30,101 @@ same scheme. Tokenization problems are a major cause of poor performance for
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
'''
-# TODO
-#The script translate_treebank_tokenization can be used to transform a treebank's
-#annotation to use one of the spacy tokenization schemes.
-
-
from __future__ import unicode_literals
cimport lang
+from .typedefs cimport flags_t
+import orth
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL
+
+from .tokens cimport Morphology
+
+
+POS_TAGS = {
+ 'NULL': (NO_TAG, {}),
+ 'EOL': (EOL, {}),
+ 'CC': (CONJ, {}),
+ 'CD': (NUM, {}),
+ 'DT': (DET, {}),
+ 'EX': (DET, {}),
+ 'FW': (X, {}),
+ 'IN': (ADP, {}),
+ 'JJ': (ADJ, {}),
+ 'JJR': (ADJ, {'misc': COMPARATIVE}),
+ 'JJS': (ADJ, {'misc': SUPERLATIVE}),
+ 'LS': (X, {}),
+ 'MD': (VERB, {'tenspect': MODAL}),
+ 'NN': (NOUN, {}),
+ 'NNS': (NOUN, {'number': PLURAL}),
+ 'NNP': (NOUN, {'misc': NAME}),
+ 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
+ 'PDT': (DET, {}),
+ 'POS': (PRT, {'case': GENITIVE}),
+ 'PRP': (NOUN, {}),
+ 'PRP$': (NOUN, {'case': GENITIVE}),
+ 'RB': (ADV, {}),
+ 'RBR': (ADV, {'misc': COMPARATIVE}),
+ 'RBS': (ADV, {'misc': SUPERLATIVE}),
+ 'RP': (PRT, {}),
+ 'SYM': (X, {}),
+ 'TO': (PRT, {}),
+ 'UH': (X, {}),
+ 'VB': (VERB, {}),
+ 'VBD': (VERB, {'tenspect': PAST}),
+ 'VBG': (VERB, {'tenspect': ING}),
+ 'VBN': (VERB, {'tenspect': PASSIVE}),
+ 'VBP': (VERB, {'tenspect': PRESENT}),
+ 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
+ 'WDT': (DET, {'misc': RELATIVE}),
+ 'WP': (PRON, {'misc': RELATIVE}),
+ 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
+ 'WRB': (ADV, {'misc': RELATIVE}),
+ '!': (PUNCT, {}),
+ '#': (PUNCT, {}),
+ '$': (PUNCT, {}),
+ "''": (PUNCT, {}),
+ "(": (PUNCT, {}),
+ ")": (PUNCT, {}),
+ "-LRB-": (PUNCT, {}),
+ "-RRB-": (PUNCT, {}),
+ ".": (PUNCT, {}),
+ ",": (PUNCT, {}),
+ "``": (PUNCT, {}),
+ ":": (PUNCT, {}),
+ "?": (PUNCT, {}),
+}
+
+
+POS_TEMPLATES = (
+ (W_sic,),
+ (P1_lemma, P1_pos),
+ (P2_lemma, P2_pos),
+ (N1_sic,),
+ (N2_sic,),
+
+ (W_suffix,),
+ (W_prefix,),
+
+ (P1_pos,),
+ (P2_pos,),
+ (P1_pos, P2_pos),
+ (P1_pos, W_sic),
+ (P1_suffix,),
+ (N1_suffix,),
+
+ (W_shape,),
+ (W_cluster,),
+ (N1_cluster,),
+ (N2_cluster,),
+ (P1_cluster,),
+ (P2_cluster,),
+
+ (W_pos_type,),
+ (N1_pos_type,),
+ (N1_pos_type,),
+ (P1_pos, W_pos_type, N1_pos_type),
+)
cdef class English(Language):
@@ -47,7 +134,68 @@ cdef class English(Language):
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
- pass
+ def get_props(self, unicode string):
+ return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
+
+ def set_flags(self, unicode string):
+ cdef flags_t flags = 0
+ flags |= orth.is_alpha(string) << IS_ALPHA
+ flags |= orth.is_ascii(string) << IS_ASCII
+ flags |= orth.is_digit(string) << IS_DIGIT
+ flags |= orth.is_lower(string) << IS_LOWER
+ flags |= orth.is_punct(string) << IS_PUNCT
+ flags |= orth.is_space(string) << IS_SPACE
+ flags |= orth.is_title(string) << IS_TITLE
+ flags |= orth.is_upper(string) << IS_UPPER
+
+ flags |= orth.like_url(string) << LIKE_URL
+ flags |= orth.like_number(string) << LIKE_NUMBER
+ return flags
+
+ def set_pos(self, Tokens tokens):
+ cdef int i
+ cdef atom_t[N_CONTEXT_FIELDS] context
+ cdef TokenC* t = tokens.data
+ assert self.morphologizer is not None
+ cdef dict tagdict = self.pos_tagger.tagdict
+ for i in range(tokens.length):
+ if t[i].lex.sic in tagdict:
+ t[i].pos = tagdict[t[i].lex.sic]
+ else:
+ fill_pos_context(context, i, t)
+ t[i].pos = self.pos_tagger.predict(context)
+ self.morphologizer.set_morph(i, t)
+
+ def train_pos(self, Tokens tokens, golds):
+ cdef int i
+ cdef atom_t[N_CONTEXT_FIELDS] context
+ c = 0
+ cdef TokenC* t = tokens.data
+ for i in range(tokens.length):
+ fill_pos_context(context, i, t)
+ t[i].pos = self.pos_tagger.predict(context, [golds[i]])
+ self.morphologizer.set_morph(i, t)
+ c += t[i].pos == golds[i]
+ return c
+
+
+cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
+ _fill_from_token(&context[P2_sic], &tokens[i-2])
+ _fill_from_token(&context[P1_sic], &tokens[i-1])
+ _fill_from_token(&context[W_sic], &tokens[i])
+ _fill_from_token(&context[N1_sic], &tokens[i+1])
+ _fill_from_token(&context[N2_sic], &tokens[i+2])
+
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+ context[0] = t.lex.sic
+ context[1] = t.lex.cluster
+ context[2] = t.lex.shape
+ context[3] = t.lex.prefix
+ context[4] = t.lex.suffix
+ context[5] = t.pos
+ context[6] = t.lemma
+ context[7] = t.lex.pos_type
EN = English('en')
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 68f1ee58a..20986f134 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -1,38 +1,38 @@
from libcpp.vector cimport vector
+from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
+
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
-from .tokens cimport Tokens
+from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
-from .ner.greedy_parser cimport NERParser
-from .utf8string cimport StringStore
+from .utf8string cimport StringStore, UniStr
+from .morphology cimport Morphologizer
-cdef extern from "Python.h":
- cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
- cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
- cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
- cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)
+cdef union LexemesOrTokens:
+ const Lexeme* const* lexemes
+ TokenC* tokens
-cdef struct String:
- Py_UNICODE* chars
- size_t n
- hash_t key
+cdef struct Cached:
+ LexemesOrTokens data
+ bint is_lex
+ int length
cdef class Lexicon:
+ cpdef public get_lex_props
cdef Pool mem
- cpdef readonly size_t size
cpdef readonly StringStore strings
cdef vector[Lexeme*] lexemes
- cdef Lexeme* get(self, String* s) except NULL
+ cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
- cdef PreshMap _dict
+ cdef PreshMap _map
cdef class Language:
@@ -41,9 +41,8 @@ cdef class Language:
cdef PreshMap _cache
cdef PreshMap _specials
cpdef readonly Lexicon lexicon
-
cpdef readonly Tagger pos_tagger
- cpdef readonly NERParser ner_tagger
+ cpdef readonly Morphologizer morphologizer
cdef object _prefix_re
cdef object _suffix_re
@@ -52,13 +51,14 @@ cdef class Language:
cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text)
- cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
- cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+ cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
+ cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+ cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL
- cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+ cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
- cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
+ cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 79a84e936..4617c3853 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -18,13 +18,14 @@ from preshed.maps cimport PreshMap
from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
+from .lexeme cimport check_flag
+
+from .utf8string cimport slice_unicode
from . import util
from .util import read_lang_data
from .tokens import Tokens
-
-from .tagger cimport Tagger
-from .ner.greedy_parser cimport NERParser
+from .tokens cimport Morphology
cdef class Language:
@@ -37,29 +38,30 @@ cdef class Language:
self._prefix_re = re.compile(prefix)
self._suffix_re = re.compile(suffix)
self._infix_re = re.compile(infix)
- self.lexicon = Lexicon()
- if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
- self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
- self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
+ self.lexicon = Lexicon(self.get_props)
self._load_special_tokenization(rules)
- if path.exists(path.join(util.DATA_DIR, name, 'pos')):
- self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
- else:
- self.pos_tagger = None
- if path.exists(path.join(util.DATA_DIR, name, 'ner')):
- self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
+ self.pos_tagger = None
+ self.morphologizer = None
+
+ def load(self):
+ self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
+ self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
+ if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
+ self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
+ self.morphologizer = Morphologizer(self.lexicon.strings,
+ path.join(util.DATA_DIR, self.name))
cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
- cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+ cdef Tokens tokens = Tokens(self, length)
if length == 0:
return tokens
- cdef String string_struct
+ cdef UniStr string_struct
cdef unicode py_string
cdef int idx = 0
for i, py_string in enumerate(strings):
- string_from_unicode(&string_struct, py_string)
- tokens.push_back(idx, self.lexicon.get(&string_struct))
+ slice_unicode(&string_struct, py_string, 0, len(py_string))
+ tokens.push_back(idx, self.lexicon.get(tokens.mem, &string_struct))
idx += len(py_string) + 1
return tokens
@@ -79,22 +81,21 @@ cdef class Language:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
"""
cdef int length = len(string)
- cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+ cdef Tokens tokens = Tokens(self, length)
if length == 0:
return tokens
cdef int i = 0
cdef int start = 0
+ cdef bint cache_hit
cdef Py_UNICODE* chars = string
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
- cdef String span
+ cdef UniStr span
for i in range(1, length):
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
- string_slice(&span, chars, start, i)
- lexemes = self._cache.get(span.key)
- if lexemes != NULL:
- tokens.extend(start, lexemes, 0)
- else:
+ slice_unicode(&span, chars, start, i)
+ cache_hit = self._try_cache(start, span.key, tokens)
+ if not cache_hit:
self._tokenize(tokens, &span, start, i)
in_ws = not in_ws
start = i
@@ -102,15 +103,27 @@ cdef class Language:
start += 1
i += 1
if start < i:
- string_slice(&span, chars, start, i)
- lexemes = self._cache.get(span.key)
- if lexemes != NULL:
- tokens.extend(start, lexemes, 0)
- else:
+ slice_unicode(&span, chars, start, i)
+ cache_hit = self._try_cache(start, span.key, tokens)
+ if not cache_hit:
self._tokenize(tokens, &span, start, i)
return tokens
- cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
+ cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
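+        # A Cached entry holds either bare Lexeme pointers (ordinary chunks)
+        # or fully-specified TokenC values (special-case rules).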
+ #cached = self._specials.get(key)
+ cached = self._cache.get(key)
+ if cached == NULL:
+ return False
+ cdef int i
+ if cached.is_lex:
+ for i in range(cached.length):
+ idx = tokens.push_back(idx, cached.data.lexemes[i])
+ else:
+ for i in range(cached.length):
+ idx = tokens.push_back(idx, &cached.data.tokens[i])
+ return True
+
+ cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
cdef hash_t orig_key
@@ -119,88 +132,95 @@ cdef class Language:
orig_size = tokens.length
self._split_affixes(span, &prefixes, &suffixes)
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
- self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
+ self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
- cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
- vector[Lexeme*] *suffixes) except NULL:
+ cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
+ vector[const Lexeme*] *suffixes) except NULL:
cdef size_t i
- cdef String prefix
- cdef String suffix
- cdef String minus_pre
- cdef String minus_suf
+ cdef UniStr prefix
+ cdef UniStr suffix
+ cdef UniStr minus_pre
+ cdef UniStr minus_suf
cdef size_t last_size = 0
while string.n != 0 and string.n != last_size:
last_size = string.n
pre_len = self._find_prefix(string.chars, string.n)
if pre_len != 0:
- string_slice(&prefix, string.chars, 0, pre_len)
- string_slice(&minus_pre, string.chars, pre_len, string.n)
+ slice_unicode(&prefix, string.chars, 0, pre_len)
+ slice_unicode(&minus_pre, string.chars, pre_len, string.n)
# Check whether we've hit a special-case
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
string[0] = minus_pre
- prefixes.push_back(self.lexicon.get(&prefix))
+ prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
break
suf_len = self._find_suffix(string.chars, string.n)
if suf_len != 0:
- string_slice(&suffix, string.chars, string.n - suf_len, string.n)
- string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+ slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
+ slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
# Check whether we've hit a special-case
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
string[0] = minus_suf
- suffixes.push_back(self.lexicon.get(&suffix))
+ suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
- string_slice(string, string.chars, pre_len, string.n - suf_len)
- prefixes.push_back(self.lexicon.get(&prefix))
- suffixes.push_back(self.lexicon.get(&suffix))
+ slice_unicode(string, string.chars, pre_len, string.n - suf_len)
+ prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
+ suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
elif pre_len:
string[0] = minus_pre
- prefixes.push_back(self.lexicon.get(&prefix))
+ prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
elif suf_len:
string[0] = minus_suf
- suffixes.push_back(self.lexicon.get(&suffix))
+ suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
if self._specials.get(string.key):
break
return string
- cdef int _attach_tokens(self, Tokens tokens,
- int idx, String* string,
- vector[Lexeme*] *prefixes,
- vector[Lexeme*] *suffixes) except -1:
+ cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+ vector[const Lexeme*] *prefixes,
+ vector[const Lexeme*] *suffixes) except -1:
+ cdef bint cache_hit
cdef int split
- cdef Lexeme** lexemes
+ cdef const Lexeme* const* lexemes
cdef Lexeme* lexeme
- cdef String span
+ cdef UniStr span
+ cdef int i
if prefixes.size():
- idx = tokens.extend(idx, prefixes.data(), prefixes.size())
+ for i in range(prefixes.size()):
+ idx = tokens.push_back(idx, prefixes[0][i])
if string.n != 0:
-
- lexemes = self._cache.get(string.key)
- if lexemes != NULL:
- idx = tokens.extend(idx, lexemes, 0)
+ cache_hit = self._try_cache(idx, string.key, tokens)
+ if cache_hit:
+ idx = tokens.data[tokens.length - 1].idx + 1
else:
split = self._find_infix(string.chars, string.n)
if split == 0 or split == -1:
- idx = tokens.push_back(idx, self.lexicon.get(string))
+ idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string))
else:
- string_slice(&span, string.chars, 0, split)
- idx = tokens.push_back(idx, self.lexicon.get(&span))
- string_slice(&span, string.chars, split, split+1)
- idx = tokens.push_back(idx, self.lexicon.get(&span))
- string_slice(&span, string.chars, split + 1, string.n)
- idx = tokens.push_back(idx, self.lexicon.get(&span))
- cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
+ slice_unicode(&span, string.chars, 0, split)
+ idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
+ slice_unicode(&span, string.chars, split, split+1)
+ idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
+ slice_unicode(&span, string.chars, split + 1, string.n)
+ idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
+ cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
idx = tokens.push_back(idx, deref(it))
preinc(it)
- cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
- lexemes = self.mem.alloc(n + 1, sizeof(Lexeme**))
+ cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
cdef int i
for i in range(n):
- lexemes[i] = tokens[i]
- lexemes[i + 1] = NULL
- self._cache.set(key, lexemes)
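+            # id == 1 marks a lexeme that was not interned in the lexicon
+            # (see Lexicon.get); its memory belongs to the document's pool,
+            # so the chunk must not be cached.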
+ if tokens[i].lex.id == 1:
+ return 0
+ cached = self.mem.alloc(1, sizeof(Cached))
+ cached.length = n
+ cached.is_lex = True
+ lexemes = self.mem.alloc(n, sizeof(Lexeme**))
+ for i in range(n):
+ lexemes[i] = tokens[i].lex
+ cached.data.lexemes = lexemes
+ self._cache.set(key, cached)
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
@@ -217,66 +237,120 @@ cdef class Language:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
- def _load_special_tokenization(self, token_rules):
- '''Load special-case tokenization rules.
-
- Loads special-case tokenization rules into the Language._cache cache,
- read from data//tokenization . The special cases are loaded before
- any language data is tokenized, giving these priority. For instance,
- the English tokenization rules map "ain't" to ["are", "not"].
-
- Args:
- token_rules (list): A list of (chunk, tokens) pairs, where chunk is
- a string and tokens is a list of strings.
+ def _load_special_tokenization(self, object rules):
+        '''Add special-case tokenization rules: a dict mapping each chunk
+        (e.g. "don't") to a list of per-token property dicts, where 'F' is
+        the token's form and 'L' its optional lemma.
+        '''
+ cdef int i
+ cdef unicode chunk
+ cdef list substrings
+ cdef unicode form
+ cdef unicode lemma
+ cdef dict props
cdef Lexeme** lexemes
cdef hash_t hashed
- cdef String string
- for uni_string, substrings in token_rules:
- lexemes = self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
- for i, substring in enumerate(substrings):
- string_from_unicode(&string, substring)
- lexemes[i] = self.lexicon.get(&string)
- lexemes[i + 1] = NULL
- string_from_unicode(&string, uni_string)
- self._specials.set(string.key, lexemes)
- self._cache.set(string.key, lexemes)
+ cdef UniStr string
+ for chunk, substrings in sorted(rules.items()):
+ tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+ for i, props in enumerate(substrings):
+ form = props['F']
+ lemma = props.get("L", None)
+ slice_unicode(&string, form, 0, len(form))
+ tokens[i].lex = self.lexicon.get(self.lexicon.mem, &string)
+ if lemma:
+ tokens[i].lemma = self.lexicon.strings[lemma]
+ set_morph_from_dict(&tokens[i].morph, props)
+ cached = self.mem.alloc(1, sizeof(Cached))
+ cached.length = len(substrings)
+ cached.is_lex = False
+ cached.data.tokens = tokens
+ slice_unicode(&string, chunk, 0, len(chunk))
+ self._specials.set(string.key, cached)
+ self._cache.set(string.key, cached)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+ morph.number = props.get('number', 0)
+ morph.tenspect = props.get('tenspect', 0)
+ morph.mood = props.get('mood', 0)
+ morph.gender = props.get('gender', 0)
+ morph.person = props.get('person', 0)
+ morph.case = props.get('case', 0)
+ morph.misc = props.get('misc', 0)
cdef class Lexicon:
- def __init__(self):
+ '''A map container for a language's Lexeme structs.
+
+ Also interns UTF-8 strings, and maps them to consecutive integer IDs.
+ '''
+ def __init__(self, object get_props):
self.mem = Pool()
- self._dict = PreshMap(2 ** 20)
+ self._map = PreshMap(2 ** 20)
self.strings = StringStore()
self.lexemes.push_back(&EMPTY_LEXEME)
- self.size = 1
+ self.get_lex_props = get_props
- cdef Lexeme* get(self, String* string) except NULL:
+ def __len__(self):
+ return self.lexemes.size()
+
+ cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
+ '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
+ if necessary, using memory acquired from the given pool. If the pool
+ is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef Lexeme* lex
- lex = self._dict.get(string.key)
+ lex = self._map.get(string.key)
if lex != NULL:
return lex
- lex = self.mem.alloc(sizeof(Lexeme), 1)
- lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
- self._dict.set(string.key, lex)
- while self.lexemes.size() < (lex.id + 1):
- self.lexemes.push_back(&EMPTY_LEXEME)
- self.lexemes[lex.id] = lex
- self.size += 1
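+        # Very short strings are always stored in the lexicon's own pool, so
+        # they are interned and cached even when the caller passes a
+        # document's pool.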
+ if string.n < 3:
+ mem = self.mem
+ cdef unicode py_string = string.chars[:string.n]
+ lex = mem.alloc(sizeof(Lexeme), 1)
+ lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
+ self.get_lex_props(py_string))
+ if mem is self.mem:
+ self._map.set(string.key, lex)
+ while self.lexemes.size() < (lex.id + 1):
+ self.lexemes.push_back(&EMPTY_LEXEME)
+ self.lexemes[lex.id] = lex
+ else:
+ lex[0].id = 1
return lex
def __getitem__(self, id_or_string):
+ '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
+ unseen unicode string is given, a new Lexeme is created and stored.
+
+ This function relies on Cython's struct-to-dict conversion. Python clients
+ receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
+ with int values. Cython clients can instead receive a Lexeme struct value.
+ More efficient Cython access is provided by Lexicon.get, which returns
+ a Lexeme*.
+
+ Args:
+ id_or_string (int or unicode): The integer ID of a word, or its unicode
+            string. If an int >= len(lexicon), IndexError is raised.
+ If id_or_string is neither an int nor a unicode string, ValueError
+ is raised.
+
+ Returns:
+ lexeme (dict): A Lexeme struct instance, which Cython translates into
+ a dict if the operator is called from Python.
+ '''
if type(id_or_string) == int:
+ if id_or_string >= self.lexemes.size():
+ raise IndexError
return self.lexemes.at(id_or_string)[0]
- cdef String string
- string_from_unicode(&string, id_or_string)
- cdef Lexeme* lexeme = self.get(&string)
+ cdef UniStr string
+ slice_unicode(&string, id_or_string, 0, len(id_or_string))
+ cdef const Lexeme* lexeme = self.get(self.mem, &string)
return lexeme[0]
def __setitem__(self, unicode uni_string, dict props):
- cdef String s
- string_from_unicode(&s, uni_string)
- cdef Lexeme* lex = self.get(&s)
+ cdef UniStr s
+ slice_unicode(&s, uni_string, 0, len(uni_string))
+ # Cast through the const here, since we're allowed to change our own
+ # Lexemes.
+ lex = self.get(self.mem, &s)
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
def dump(self, loc):
@@ -287,11 +361,11 @@ cdef class Lexicon:
assert fp != NULL
cdef size_t st
cdef hash_t key
- for i in range(self._dict.length):
- key = self._dict.c_map.cells[i].key
+ for i in range(self._map.length):
+ key = self._map.c_map.cells[i].key
if key == 0:
continue
- lexeme = self._dict.c_map.cells[i].value
+ lexeme = self._map.c_map.cells[i].value
st = fwrite(&key, sizeof(key), 1, fp)
assert st == 1
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
@@ -300,7 +374,8 @@ cdef class Lexicon:
assert st == 0
def load(self, loc):
- assert path.exists(loc)
+ if not path.exists(loc):
+ raise IOError('Lexemes file not found at %s' % loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(bytes_loc, 'rb')
assert fp != NULL
@@ -316,21 +391,9 @@ cdef class Lexicon:
st = fread(lexeme, sizeof(Lexeme), 1, fp)
if st != 1:
break
- self._dict.set(key, lexeme)
+ self._map.set(key, lexeme)
while self.lexemes.size() < (lexeme.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lexeme.id] = lexeme
i += 1
- self.size += 1
fclose(fp)
-
-
-cdef void string_from_unicode(String* s, unicode uni):
- cdef Py_UNICODE* c_uni = uni
- string_slice(s, c_uni, 0, len(uni))
-
-
-cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
- s.chars = &chars[start]
- s.n = end - start
- s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
new file mode 100644
index 000000000..ce9bbefdc
--- /dev/null
+++ b/spacy/lemmatizer.py
@@ -0,0 +1,90 @@
+from os import path
+
+
+NOUN_RULES = (
+ ('s', ''),
+ ('ses', 's'),
+ ('ves', 'f'),
+ ('xes', 'x'),
+ ('zes', 'z'),
+ ('ches', 'ch'),
+ ('shes', 'sh'),
+ ('men', 'man'),
+ ('ies', 'y')
+)
+
+
+VERB_RULES = (
+ ("s", ""),
+ ("ies", "y"),
+ ("es", "e"),
+ ("es", ""),
+ ("ed", "e"),
+ ("ed", ""),
+ ("ing", "e"),
+ ("ing", "")
+)
+
+
+ADJ_RULES = (
+ ("er", ""),
+ ("est", ""),
+ ("er", "e"),
+ ("est", "e")
+)
+
+
+class Lemmatizer(object):
+ def __init__(self, wn_dict_dir):
+ self.index = {}
+ self.exc = {}
+ for pos in ['adj', 'adv', 'noun', 'verb']:
+ self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
+ self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
+
+ def noun(self, string):
+ return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)
+
+ def verb(self, string):
+ return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)
+
+ def adj(self, string):
+ return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)
+
+
+def lemmatize(string, index, exceptions, rules):
+ string = string.lower()
+ forms = []
+ if string in index:
+ forms.append(string)
+ forms.extend(exceptions.get(string, []))
+ for old, new in rules:
+ if string.endswith(old):
+ form = string[:len(string) - len(old)] + new
+ if form in index:
+ forms.append(form)
+ if not forms:
+ forms.append(string)
+ return set(forms)
+
+
+def read_index(loc):
+ index = set()
+ for line in open(loc):
+ if line.startswith(' '):
+ continue
+ pieces = line.split()
+ word = pieces[0]
+ if word.count('_') == 0:
+ index.add(word)
+ return index
+
+
+def read_exc(loc):
+ exceptions = {}
+ for line in open(loc):
+ if line.startswith(' '):
+ continue
+ pieces = line.split()
+ exceptions[pieces[0]] = tuple(pieces[1:])
+ return exceptions
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 0d7d206e5..a6f20906b 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,61 +1,137 @@
-from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
from .utf8string cimport StringStore
-from libc.stdint cimport uint16_t
-cpdef flag_t OOV_DIST_FLAGS
-# Flags
-cpdef enum:
- IS_ALPHA
- IS_ASCII
- IS_DIGIT
- IS_LOWER
- IS_PUNCT
- IS_SPACE
- IS_TITLE
- IS_UPPER
+# Reserve 64 values for flag features
+cpdef enum attr_id_t:
+ FLAG0
+ FLAG1
+ FLAG2
+ FLAG3
+ FLAG4
+ FLAG5
+ FLAG6
+ FLAG7
+ FLAG8
+ FLAG9
+ FLAG10
+ FLAG11
+ FLAG12
+ FLAG13
+ FLAG14
+ FLAG15
+ FLAG16
+ FLAG17
+ FLAG18
+ FLAG19
+ FLAG20
+ FLAG21
+ FLAG22
+ FLAG23
+ FLAG24
+ FLAG25
+ FLAG26
+ FLAG27
+ FLAG28
+ FLAG29
+ FLAG30
+ FLAG31
+ FLAG32
+ FLAG33
+ FLAG34
+ FLAG35
+ FLAG36
+ FLAG37
+ FLAG38
+ FLAG39
+ FLAG40
+ FLAG41
+ FLAG42
+ FLAG43
+ FLAG44
+ FLAG45
+ FLAG46
+ FLAG47
+ FLAG48
+ FLAG49
+ FLAG50
+ FLAG51
+ FLAG52
+ FLAG53
+ FLAG54
+ FLAG55
+ FLAG56
+ FLAG57
+ FLAG58
+ FLAG59
+ FLAG60
+ FLAG61
+ FLAG62
+ FLAG63
- LIKE_URL
- LIKE_NUMBER
+ ID
+ SIC
+ DENSE
+ SHAPE
+ PREFIX
+ SUFFIX
- OFT_LOWER
- OFT_TITLE
- OFT_UPPER
-
- IN_MALES
- IN_FEMALES
- IN_SURNAMES
- IN_PLACES
- IN_GAMES
- IN_CELEBS
- IN_NAMES
+ LENGTH
+ CLUSTER
+ POS_TYPE
+ LEMMA
cdef struct Lexeme:
- flag_t flags
+ flags_t flags
- id_t id
- id_t sic
- id_t norm
- id_t shape
- id_t asciied
- id_t prefix
- id_t suffix
+ attr_t id
+ attr_t sic
+ attr_t dense
+ attr_t shape
+ attr_t prefix
+ attr_t suffix
+
+ attr_t length
+ attr_t cluster
+ attr_t pos_type
float prob
-
- len_t length
- tag_t cluster
- tag_t postype
- tag_t supersense
+ float sentiment
cdef Lexeme EMPTY_LEXEME
-cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
- StringStore store, dict props) except *
+
+cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
+ dict props) except *
-cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
+cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
+
+
+cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
+ if feat_name < (sizeof(flags_t) * 8):
+ return check_flag(lex, feat_name)
+ elif feat_name == ID:
+ return lex.id
+ elif feat_name == SIC:
+ return lex.sic
+ elif feat_name == DENSE:
+ return lex.dense
+ elif feat_name == SHAPE:
+ return lex.shape
+ elif feat_name == PREFIX:
+ return lex.prefix
+ elif feat_name == SUFFIX:
+ return lex.suffix
+ elif feat_name == LENGTH:
+ return lex.length
+ elif feat_name == CLUSTER:
+ return lex.cluster
+ elif feat_name == POS_TYPE:
+ return lex.pos_type
+ else:
+ return 0
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 64eb699a6..f1974cbc9 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -6,67 +6,25 @@ from libc.string cimport memset
import orth
-from .utf8string cimport Utf8Str
-
-OOV_DIST_FLAGS = 0
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
-def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
- cdef flag_t flags = 0
- flags |= orth.is_alpha(string) << IS_ALPHA
- flags |= orth.is_ascii(string) << IS_ASCII
- flags |= orth.is_digit(string) << IS_DIGIT
- flags |= orth.is_lower(string) << IS_LOWER
- flags |= orth.is_punct(string) << IS_PUNCT
- flags |= orth.is_space(string) << IS_SPACE
- flags |= orth.is_title(string) << IS_TITLE
- flags |= orth.is_upper(string) << IS_UPPER
-
- flags |= orth.like_url(string) << LIKE_URL
- flags |= orth.like_number(string) << LIKE_NUMBER
- return flags
-
-
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
- StringStore store, dict props) except *:
+ StringStore string_store, dict props) except *:
cdef Lexeme lex
lex.id = i
lex.length = len(string)
- lex.sic = get_string_id(string, store)
+ lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
- lex.postype = props.get('postype', 0)
- lex.supersense = props.get('supersense', 0)
+ lex.pos_type = props.get('pos_type', 0)
lex.prob = props.get('prob', 0)
- cdef float upper_pc = props.get('upper_pc', 0.0)
- cdef float lower_pc = props.get('lower_pc', 0.0)
- cdef float title_pc = props.get('title_pc', 0.0)
-
- lex.prefix = get_string_id(string[0], store)
- lex.suffix = get_string_id(string[-3:], store)
- if upper_pc or lower_pc or title_pc:
- canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
- lex.norm = get_string_id(canon_cased, store)
- else:
- lex.norm = lex.sic
- lex.shape = get_string_id(orth.word_shape(string), store)
- lex.asciied = get_string_id(orth.asciied(string), store)
- lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
-
- lex.flags |= props.get('in_males', 0) << IN_MALES
- lex.flags |= props.get('in_females', 0) << IN_FEMALES
- lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
- lex.flags |= props.get('in_places', 0) << IN_PLACES
- lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
- lex.flags |= props.get('in_games', 0) << IN_GAMES
- lex.flags |= props.get('in_names', 0) << IN_NAMES
+ lex.prefix = string_store[string[:1]]
+ lex.suffix = string_store[string[-3:]]
+ lex.shape = string_store[orth.word_shape(string)]
+ lex.dense = string_store[props['dense']]
+
+ lex.flags = props.get('flags', 0)
return lex
-
-
-cdef id_t get_string_id(unicode string, StringStore store) except 0:
- cdef bytes byte_string = string.encode('utf8')
- cdef Utf8Str* orig_str = store.intern(byte_string, len(byte_string))
- return orig_str.i
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
new file mode 100644
index 000000000..9c5d342e9
--- /dev/null
+++ b/spacy/morphology.pxd
@@ -0,0 +1,45 @@
+
+from .tokens cimport TokenC
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
+from .typedefs cimport id_t, Morphology
+
+from preshed.maps cimport PreshMapArray
+from cymem.cymem cimport Pool
+
+
+# Google universal tag set
+cpdef enum univ_tag_t:
+ NO_TAG
+ ADJ
+ ADV
+ ADP
+ CONJ
+ DET
+ NOUN
+ NUM
+ PRON
+ PRT
+ VERB
+ X
+ PUNCT
+ EOL
+ N_UNIV_TAGS
+
+
+cdef struct PosTag:
+ Morphology morph
+ int id
+ univ_tag_t pos
+
+
+cdef class Morphologizer:
+ cdef Pool mem
+ cdef StringStore strings
+ cdef object lemmatizer
+ cdef PosTag* tags
+ cdef readonly list tag_names
+
+ cdef PreshMapArray _cache
+ cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
+ cdef int set_morph(self, const int i, TokenC* tokens) except -1
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
new file mode 100644
index 000000000..346c778a9
--- /dev/null
+++ b/spacy/morphology.pyx
@@ -0,0 +1,117 @@
+# cython: profile=True
+# cython: embedsignature=True
+from os import path
+import json
+
+from .lemmatizer import Lemmatizer
+from .typedefs cimport id_t
+
+UNIV_TAGS = {
+ 'NULL': NO_TAG,
+ 'ADJ': ADJ,
+ 'ADV': ADV,
+ 'ADP': ADP,
+ 'CONJ': CONJ,
+ 'DET': DET,
+ 'NOUN': NOUN,
+ 'NUM': NUM,
+ 'PRON': PRON,
+ 'PRT': PRT,
+ 'VERB': VERB,
+ 'X': X,
+ '.': PUNCT,
+ 'EOL': EOL
+}
+
+
+cdef struct _Cached:
+ Morphology morph
+ int lemma
+
+
+cdef class Morphologizer:
+ """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
+ """
+ def __init__(self, StringStore strings, data_dir):
+ self.mem = Pool()
+ self.strings = strings
+ cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+ tag_map = cfg['tag_map']
+ self.tag_names = cfg['tag_names']
+ self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
+ self._cache = PreshMapArray(len(self.tag_names))
+ self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag))
+ for i, tag in enumerate(self.tag_names):
+ pos, props = tag_map[tag]
+ self.tags[i].id = i
+ self.tags[i].pos = pos
+ self.tags[i].morph.number = props.get('number', 0)
+ self.tags[i].morph.tenspect = props.get('tenspect', 0)
+ self.tags[i].morph.mood = props.get('mood', 0)
+ self.tags[i].morph.gender = props.get('gender', 0)
+ self.tags[i].morph.person = props.get('person', 0)
+ self.tags[i].morph.case = props.get('case', 0)
+ self.tags[i].morph.misc = props.get('misc', 0)
+ if path.exists(path.join(data_dir, 'morphs.json')):
+ with open(path.join(data_dir, 'morphs.json')) as file_:
+ self.load_exceptions(json.load(file_))
+
+ cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+ if self.lemmatizer is None:
+ return lex.sic
+ if pos != NOUN and pos != VERB and pos != ADJ:
+ return lex.sic
+ cdef bytes py_string = self.strings[lex.sic]
+ cdef set lemma_strings
+ cdef bytes lemma_string
+ if pos == NOUN:
+ lemma_strings = self.lemmatizer.noun(py_string)
+ elif pos == VERB:
+ lemma_strings = self.lemmatizer.verb(py_string)
+ else:
+ assert pos == ADJ
+ lemma_strings = self.lemmatizer.adj(py_string)
+ lemma_string = sorted(lemma_strings)[0]
+ lemma = self.strings.intern(lemma_string, len(lemma_string)).i
+ return lemma
+
+ cdef int set_morph(self, const int i, TokenC* tokens) except -1:
+ cdef const PosTag* tag = &self.tags[tokens[i].pos]
+ cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
+ if cached is NULL:
+ cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+ cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
+ cached.morph = tag.morph
+ self._cache.set(tag.id, tokens[i].lex.sic, cached)
+
+ tokens[i].lemma = cached.lemma
+ tokens[i].morph = cached.morph
+
+ def load_exceptions(self, dict exc):
+ cdef unicode pos_str
+ cdef unicode form_str
+ cdef unicode lemma_str
+ cdef dict entries
+ cdef dict props
+ cdef int lemma
+ cdef id_t sic
+ cdef univ_tag_t pos
+ for pos_str, entries in exc.items():
+ pos = self.tag_names.index(pos_str)
+ for form_str, props in entries.items():
+ lemma_str = props.get('L', form_str)
+ sic = self.strings[form_str]
+ cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+ cached.lemma = self.strings[lemma_str]
+ set_morph_from_dict(&cached.morph, props)
+ self._cache.set(pos, sic, cached)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+ morph.number = props.get('number', 0)
+ morph.tenspect = props.get('tenspect', 0)
+ morph.mood = props.get('mood', 0)
+ morph.gender = props.get('gender', 0)
+ morph.person = props.get('person', 0)
+ morph.case = props.get('case', 0)
+ morph.misc = props.get('misc', 0)
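
The Morphologizer above memoises its work per (tag, word) pair: set_morph consults a cache, and on a miss lemmatizes the word (only for NOUN, VERB and ADJ; other tags keep the surface form) and stores the result together with the tag's morphology. A minimal Python sketch of that pattern, assuming only a lemmatizer with noun/verb/adj methods that return candidate sets; ToyMorphologizer and its names are illustrative, not the spaCy API:

    NOUN, VERB, ADJ = 'NOUN', 'VERB', 'ADJ'

    class ToyMorphologizer:
        def __init__(self, lemmatizer):
            self.lemmatizer = lemmatizer
            self._cache = {}   # (tag, word) -> lemma

        def lemmatize(self, pos, word):
            if pos == NOUN:
                candidates = self.lemmatizer.noun(word)
            elif pos == VERB:
                candidates = self.lemmatizer.verb(word)
            elif pos == ADJ:
                candidates = self.lemmatizer.adj(word)
            else:
                return word   # other parts of speech keep the surface form
            # When WordNet offers several lemmas, the alphabetically first wins.
            return sorted(candidates)[0]

        def set_morph(self, pos, word):
            key = (pos, word)
            if key not in self._cache:
                self._cache[key] = self.lemmatize(pos, word)
            return self._cache[key]
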
diff --git a/spacy/orth.py b/spacy/orth.py
index 0462d15df..2400b38a6 100644
--- a/spacy/orth.py
+++ b/spacy/orth.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import unicodedata
from unidecode import unidecode
+import re
import math
diff --git a/spacy/pos_util.py b/spacy/pos_util.py
index e5716665e..489f03dde 100644
--- a/spacy/pos_util.py
+++ b/spacy/pos_util.py
@@ -147,6 +147,7 @@ Y PRT
Z NOUN
^ NOUN
~ X
-`` .""".strip().split('\n'))
+`` .
+EOL EOL""".strip().split('\n'))
return mapping[tag]
diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
index 11d8d2a4c..33732f987 100644
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@@ -1,34 +1,23 @@
+from libc.stdint cimport uint8_t
+
from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
-from .typedefs cimport hash_t
-from .context cimport Slots
+from preshed.maps cimport PreshMapArray
+
+from .typedefs cimport hash_t, id_t
from .tokens cimport Tokens
-cpdef enum TagType:
- POS
- ENTITY
- SENSE
-
-
cdef class Tagger:
- cpdef int set_tags(self, Tokens tokens) except -1
- cpdef class_t predict(self, int i, Tokens tokens) except 0
- cpdef int tell_answer(self, list gold) except -1
+ cdef class_t predict(self, const atom_t* context, object golds=*) except *
cpdef readonly Pool mem
cpdef readonly Extractor extractor
cpdef readonly LinearModel model
- cpdef readonly TagType tag_type
cpdef readonly list tag_names
-
- cdef class_t _guess
- cdef atom_t* _context
- cdef feat_t* _feats
- cdef weight_t* _values
- cdef weight_t* _scores
+ cdef dict tagdict
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 428814f70..9890e95e1 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -1,5 +1,4 @@
# cython: profile=True
-from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
@@ -10,155 +9,59 @@ import random
import json
import cython
-
-from .context cimport fill_context
-from .context cimport N_FIELDS
-
-from thinc.features cimport ConjFeat
+from thinc.features cimport Feature, count_feats
-NULL_TAG = 0
-
-
-def setup_model_dir(tag_type, tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
if path.exists(model_dir):
shutil.rmtree(model_dir)
os.mkdir(model_dir)
config = {
- 'tag_type': tag_type,
'templates': templates,
'tag_names': tag_names,
+ 'tag_map': tag_map,
+ 'tag_counts': tag_counts,
}
with open(path.join(model_dir, 'config.json'), 'w') as file_:
json.dump(config, file_)
-def train(train_sents, model_dir, nr_iter=10):
- cdef Tokens tokens
- tagger = Tagger(model_dir)
- for _ in range(nr_iter):
- n_corr = 0
- total = 0
- for tokens, golds in train_sents:
- assert len(tokens) == len(golds), [t.string for t in tokens]
- for i in range(tokens.length):
- if tagger.tag_type == POS:
- gold = _get_gold_pos(i, golds, tokens.pos)
- elif tagger.tag_type == ENTITY:
- gold = _get_gold_ner(i, golds, tokens.ner)
- guess = tagger.predict(i, tokens)
- tokens.set_tag(i, tagger.tag_type, guess)
- if gold is not None:
- tagger.tell_answer(gold)
- total += 1
- n_corr += guess in gold
- #print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
- print('%.4f' % ((n_corr / total) * 100))
- random.shuffle(train_sents)
- tagger.model.end_training()
- tagger.model.dump(path.join(model_dir, 'model'))
-
-
-cdef object _get_gold_pos(i, golds, int* pred):
- if golds[i] == 0:
- return None
- else:
- return [golds[i]]
-
-
-cdef object _get_gold_ner(i, golds, int* ner):
- if golds[i] == 0:
- return None
- else:
- return [golds[i]]
-
-
-def evaluate(tagger, sents):
- n_corr = 0
- total = 0
- for tokens, golds in sents:
- for i, gold in enumerate(golds):
- guess = tagger.predict(i, tokens)
- tokens.set_tag(i, tagger.tag_type, guess)
- if gold != NULL_TAG:
- total += 1
- n_corr += guess == gold
- return n_corr / total
-
-
cdef class Tagger:
- """Assign part-of-speech, named entity or supersense tags, using greedy
- decoding. The tagger reads its model and configuration from disk.
+ """Predict some type of tag, using greedy decoding. The tagger reads its
+ model and configuration from disk.
"""
def __init__(self, model_dir):
self.mem = Pool()
cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates']
+ univ_counts = {}
+ cdef unicode tag
+ cdef unicode univ_tag
self.tag_names = cfg['tag_names']
- self.tag_type = cfg['tag_type']
- self.extractor = Extractor(templates, [ConjFeat] * len(templates))
- self.model = LinearModel(len(self.tag_names))
+ self.tagdict = _make_tag_dict(cfg['tag_counts'])
+ self.extractor = Extractor(templates)
+ self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
- self._context = self.mem.alloc(N_FIELDS, sizeof(atom_t))
- self._feats = self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
- self._values = self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
- self._scores = self.mem.alloc(self.model.nr_class, sizeof(weight_t))
- self._guess = NULL_TAG
-
- cpdef int set_tags(self, Tokens tokens) except -1:
- """Assign tags to a Tokens object.
-
- >>> tokens = EN.tokenize(u'An example sentence.')
- >>> assert tokens[0].pos == 'NO_TAG'
- >>> EN.pos_tagger.set_tags(tokens)
- >>> assert tokens[0].pos == 'DT'
- """
- cdef int i
- for i in range(tokens.length):
- tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
-
- cpdef class_t predict(self, int i, Tokens tokens) except 0:
- """Predict the tag of tokens[i]. The tagger remembers the features and
- prediction, in case you later call tell_answer.
+ cdef class_t predict(self, atom_t* context, object golds=None) except *:
+ """Predict the tag of tokens[i].
>>> tokens = EN.tokenize(u'An example sentence.')
>>> tag = EN.pos_tagger.predict(0, tokens)
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
"""
- fill_context(self._context, i, tokens)
- self.extractor.extract(self._feats, self._values, self._context, NULL)
- self._guess = self.model.score(self._scores, self._feats, self._values)
- return self._guess
-
- cpdef int tell_answer(self, list golds) except -1:
- """Provide the correct tag for the word the tagger was last asked to predict.
- During Tagger.predict, the tagger remembers the features and prediction
- for the example. These are used to calculate a weight update given the
- correct label.
-
- >>> tokens = EN.tokenize('An example sentence.')
- >>> guess = EN.pos_tagger.predict(1, tokens)
- >>> JJ = EN.pos_tagger.tag_id('JJ')
- >>> JJ
- 7
- >>> EN.pos_tagger.tell_answer(JJ)
- """
- cdef class_t guess = self._guess
- if guess in golds:
- self.model.update({})
- return 0
- best_gold = golds[0]
- best_score = self._scores[best_gold-1]
- for gold in golds[1:]:
- if self._scores[gold-1] > best_gold:
- best_score = self._scores[best_gold-1]
- best_gold = gold
- counts = {guess: {}, best_gold: {}}
- self.extractor.count(counts[best_gold], self._feats, 1)
- self.extractor.count(counts[guess], self._feats, -1)
- self.model.update(counts)
+ cdef int n_feats
+ cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
+ cdef weight_t* scores = self.model.get_scores(feats, n_feats)
+ guess = _arg_max(scores, self.model.nr_class)
+ if golds is not None and guess not in golds:
+ best = _arg_max_among(scores, golds)
+ counts = {guess: {}, best: {}}
+ count_feats(counts[guess], feats, n_feats, -1)
+ count_feats(counts[best], feats, n_feats, 1)
+ self.model.update(counts)
+ return guess
def tag_id(self, object tag_name):
"""Encode tag_name into a tag ID integer."""
@@ -167,3 +70,41 @@ cdef class Tagger:
tag_id = len(self.tag_names)
self.tag_names.append(tag_name)
return tag_id
+
+
+def _make_tag_dict(counts):
+ freq_thresh = 20
+ ambiguity_thresh = 0.97
+ tagdict = {}
+ cdef atom_t word
+ cdef atom_t tag
+ for word_str, tag_freqs in counts.items():
+ tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1])
+ n = sum(tag_freqs.values())
+ word = int(word_str)
+ tag = int(tag_str)
+ if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
+ tagdict[word] = tag
+ return tagdict
+
+
+cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000:
+ cdef int best = 0
+ cdef weight_t score = scores[best]
+ cdef int i
+ for i in range(1, n_classes):
+ if scores[i] >= score:
+ score = scores[i]
+ best = i
+ return best
+
+
+cdef class_t _arg_max_among(weight_t* scores, list classes):
+ cdef int best = classes[0]
+ cdef weight_t score = scores[best]
+ cdef class_t clas
+ for clas in classes:
+ if scores[clas] > score:
+ score = scores[clas]
+ best = clas
+ return best
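
The new predict method folds training into prediction: extract features, score every class, take the argmax, and when gold tags are supplied and the guess is not among them, count the features +1 for the best-scoring gold class and -1 for the guess before handing the counts to model.update. A schematic Python version of that update rule, with a dict-of-dicts standing in for thinc's LinearModel (names are illustrative):

    from collections import defaultdict

    def predict_and_update(weights, feats, n_classes, golds=None):
        # weights: class -> feature -> weight; feats: extracted feature keys.
        scores = [sum(weights[c][f] for f in feats) for c in range(n_classes)]
        guess = max(range(n_classes), key=lambda c: scores[c])
        if golds is not None and guess not in golds:
            best = max(golds, key=lambda c: scores[c])
            for f in feats:
                weights[best][f] += 1.0    # reward the best gold class
                weights[guess][f] -= 1.0   # penalise the wrong guess
        return guess

    weights = defaultdict(lambda: defaultdict(float))
    predict_and_update(weights, ['w=the', 'p1=START'], n_classes=3, golds=[1])

The tag dictionary built by _make_tag_dict complements this: words seen at least 20 times whose most frequent tag covers at least 97% of their occurrences are recorded in self.tagdict, presumably so such words can be tagged by lookup rather than by the model.
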
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index d1b2ef10b..43aa7b442 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -1,40 +1,55 @@
+import numpy as np
+cimport numpy as np
+
from cymem.cymem cimport Pool
+from thinc.typedefs cimport atom_t
from .lexeme cimport Lexeme
-from .typedefs cimport flag_t
-from .utf8string cimport StringStore
-from .tagger cimport TagType
-from thinc.typedefs cimport atom_t
+from .typedefs cimport flags_t
+from .typedefs cimport Morphology
+from .lang cimport Language
+
+
+
+cdef struct TokenC:
+ const Lexeme* lex
+ Morphology morph
+ int idx
+ int pos
+ int lemma
+ int sense
+
+
+ctypedef const Lexeme* const_Lexeme_ptr
+ctypedef TokenC* TokenC_ptr
+
+ctypedef fused LexemeOrToken:
+ const_Lexeme_ptr
+ TokenC_ptr
cdef class Tokens:
cdef Pool mem
- cdef StringStore _string_store
+ cdef Language lang
+ cdef list tag_names
- cdef Lexeme** _lex_ptr
- cdef int* _idx_ptr
- cdef int* _pos_ptr
- cdef int* _ner_ptr
- cdef Lexeme** lex
- cdef int* idx
- cdef int* pos
- cdef int* ner
+ cdef TokenC* data
cdef int length
cdef int max_length
- cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
- cdef int push_back(self, int i, Lexeme* lexeme) except -1
- cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
+ cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
+
+ cpdef np.ndarray[long, ndim=2] get_array(self, list features)
cdef class Token:
- cdef StringStore _string_store
+ cdef public Language lang
cdef public int i
cdef public int idx
- cdef public int pos
- cdef public int ner
+ cdef int pos
+ cdef int lemma
cdef public atom_t id
cdef public atom_t cluster
@@ -51,4 +66,4 @@ cdef class Token:
cdef public float prob
- cdef public flag_t flags
+ cdef public flags_t flags
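
The fused LexemeOrToken type above lets push_back accept either a bare Lexeme pointer (an ordinary vocabulary entry) or a fully populated TokenC, e.g. a special-case token that already carries a lemma, in which case the whole struct is copied. A toy dict-based model of that dispatch, purely illustrative:

    def push_back(data, idx, lex_or_tok):
        # Hypothetical stand-in: dicts play the role of the C structs, and the
        # caller threads the character offset `idx` through the return value.
        if 'lex' in lex_or_tok:                    # a pre-built TokenC-like dict
            token = dict(lex_or_tok)               # copy the whole record
        else:                                      # a bare Lexeme-like dict
            token = {'lex': lex_or_tok, 'pos': 0, 'lemma': 0}
        data.append(token)
        return idx + token['lex']['length']

    tokens = []
    idx = push_back(tokens, 0, {'length': 2})                          # ordinary word
    idx = push_back(tokens, idx, {'lex': {'length': 3}, 'lemma': 7})   # special case
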
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 721e6bb80..617feb269 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -1,7 +1,15 @@
# cython: profile=True
+from preshed.maps cimport PreshMap
+from preshed.counter cimport PreshCounter
+
from .lexeme cimport *
cimport cython
-from .tagger cimport POS, ENTITY
+
+import numpy as np
+cimport numpy as np
+
+POS = 0
+ENTITY = 0
DEF PADDING = 5
@@ -17,23 +25,13 @@ cdef class Tokens:
"""A sequence of references to Lexeme objects.
The Tokens class provides fast and memory-efficient access to lexical features,
- and can efficiently export the data to a numpy array. Specific languages
- create their own Tokens subclasses, to provide more convenient access to
- language-specific features.
+ and can efficiently export the data to a numpy array.
>>> from spacy.en import EN
>>> tokens = EN.tokenize('An example sentence.')
- >>> tokens.string(0)
- 'An'
- >>> tokens.prob(0) > tokens.prob(1)
- True
- >>> tokens.can_noun(0)
- False
- >>> tokens.can_noun(1)
- True
"""
- def __init__(self, StringStore string_store, string_length=0):
- self._string_store = string_store
+ def __init__(self, Language lang, string_length=0):
+ self.lang = lang
if string_length >= 3:
size = int(string_length / 3.0)
else:
@@ -42,28 +40,18 @@ cdef class Tokens:
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
- self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
- self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int))
- self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int))
- self._ner_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int))
- self.lex = self._lex_ptr
- self.idx = self._idx_ptr
- self.pos = self._pos_ptr
- self.ner = self._ner_ptr
+ data_start = self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
cdef int i
for i in range(size + (PADDING*2)):
- self.lex[i] = &EMPTY_LEXEME
- self.lex += PADDING
- self.idx += PADDING
- self.pos += PADDING
- self.ner += PADDING
+ data_start[i].lex = &EMPTY_LEXEME
+ self.data = data_start + PADDING
self.max_length = size
self.length = 0
def __getitem__(self, i):
bounds_check(i, self.length, PADDING)
- return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
- self.lex[i][0])
+ return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
+ self.data[i].lemma, self.data[i].lex[0])
def __iter__(self):
for i in range(self.length):
@@ -72,70 +60,78 @@ cdef class Tokens:
def __len__(self):
return self.length
- cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
+ cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
- self.lex[self.length] = lexeme
- self.idx[self.length] = idx
- self.pos[self.length] = 0
- self.ner[self.length] = 0
- self.length += 1
- return idx + lexeme.length
-
- cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
- cdef int i
- if lexemes == NULL:
- return idx
- elif n == 0:
- i = 0
- while lexemes[i] != NULL:
- idx = self.push_back(idx, lexemes[i])
- i += 1
+ cdef TokenC* t = &self.data[self.length]
+ if LexemeOrToken is TokenC_ptr:
+ t[0] = lex_or_tok[0]
else:
- for i in range(n):
- idx = self.push_back(idx, lexemes[i])
- return idx
+ t.lex = lex_or_tok
+ self.length += 1
+ return idx + t.lex.length
- cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
- if tag_type == POS:
- self.pos[i] = tag
- elif tag_type == ENTITY:
- self.ner[i] = tag
+ @cython.boundscheck(False)
+ cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
+ cdef int i, j
+ cdef attr_id_t feature
+ cdef np.ndarray[long, ndim=2] output
+ output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
+ for i in range(self.length):
+ for j, feature in enumerate(attr_ids):
+ output[i, j] = get_attr(self.data[i].lex, feature)
+ return output
+
+ def count_by(self, attr_id_t attr_id):
+ cdef int i
+ cdef attr_t attr
+ cdef size_t count
+
+ cdef PreshCounter counts = PreshCounter(2 ** 8)
+ for i in range(self.length):
+ if attr_id == LEMMA:
+ attr = self.data[i].lemma
+ else:
+ attr = get_attr(self.data[i].lex, attr_id)
+ counts.inc(attr, 1)
+ return dict(counts)
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
- self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
- self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int))
- self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int))
- self._ner_ptr = self.mem.realloc(self._ner_ptr, n * sizeof(int))
- self.lex = self._lex_ptr + PADDING
- self.idx = self._idx_ptr + PADDING
- self.pos = self._pos_ptr + PADDING
- self.ner = self._ner_ptr + PADDING
+ # What we're storing is a "padded" array. We've jumped forward PADDING
+ # places, and are storing the pointer to that. This way, we can access
+ # words out-of-bounds, and get out-of-bounds markers.
+ # Now that we want to realloc, we need the address of the true start,
+ # so we jump the pointer back PADDING places.
+ cdef TokenC* data_start = self.data - PADDING
+ data_start = self.mem.realloc(data_start, n * sizeof(TokenC))
+ self.data = data_start + PADDING
+ cdef int i
for i in range(self.length, self.max_length + PADDING):
- self.lex[i] = &EMPTY_LEXEME
+ self.data[i].lex = &EMPTY_LEXEME
@cython.freelist(64)
cdef class Token:
- def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
- dict lex):
- self._string_store = string_store
+ def __init__(self, Language lang, int i, int idx,
+ int pos, int lemma, dict lex):
+ self.lang = lang
self.idx = idx
self.pos = pos
- self.ner = ner
self.i = i
self.id = lex['id']
+
+ self.lemma = lemma
self.cluster = lex['cluster']
self.length = lex['length']
- self.postype = lex['postype']
- self.sensetype = lex['supersense']
+ self.postype = lex['pos_type']
+ self.sensetype = 0
self.sic = lex['sic']
- self.norm = lex['norm']
+ self.norm = lex['dense']
self.shape = lex['shape']
- self.suffix = lex['asciied']
+ self.suffix = lex['suffix']
self.prefix = lex['prefix']
self.prob = lex['prob']
@@ -145,5 +141,16 @@ cdef class Token:
def __get__(self):
if self.sic == 0:
return ''
- cdef bytes utf8string = self._string_store[self.sic]
+ cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
return utf8string.decode('utf8')
+
+ property lemma:
+ def __get__(self):
+ if self.lemma == 0:
+ return self.string
+ cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
+ return utf8string.decode('utf8')
+
+ property pos:
+ def __get__(self):
+ return self.lang.pos_tagger.tag_names[self.pos]
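
get_array and count_by are the new export paths: the first packs the requested attribute IDs into a (n_tokens, n_attributes) integer numpy array, the second tallies a single attribute into a frequency dict (special-casing LEMMA, which lives on the token rather than the lexeme). A rough Python equivalent over a list of attribute dicts, with string keys standing in for spaCy's integer attribute IDs:

    import numpy as np
    from collections import Counter

    def get_array(tokens, attr_ids):
        output = np.ndarray(shape=(len(tokens), len(attr_ids)), dtype=int)
        for i, token in enumerate(tokens):
            for j, attr in enumerate(attr_ids):
                output[i, j] = token[attr]
        return output

    def count_by(tokens, attr_id):
        return dict(Counter(token[attr_id] for token in tokens))

    toks = [{'sic': 5, 'lemma': 2}, {'sic': 7, 'lemma': 2}]
    assert get_array(toks, ['sic', 'lemma']).shape == (2, 2)
    assert count_by(toks, 'lemma') == {2: 2}
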
diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd
index 21818f05e..02d327b72 100644
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -1,8 +1,20 @@
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
+from libc.stdint cimport uint8_t
ctypedef uint64_t hash_t
ctypedef char* utf8_t
-ctypedef uint64_t flag_t
+ctypedef uint32_t attr_t
+ctypedef uint64_t flags_t
ctypedef uint32_t id_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t
+
+
+cdef struct Morphology:
+ uint8_t number
+ uint8_t tenspect # Tense/aspect/voice
+ uint8_t mood
+ uint8_t gender
+ uint8_t person
+ uint8_t case
+ uint8_t misc
diff --git a/spacy/utf8string.pxd b/spacy/utf8string.pxd
index 82ae50022..5ef4113d5 100644
--- a/spacy/utf8string.pxd
+++ b/spacy/utf8string.pxd
@@ -1,5 +1,6 @@
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
+from murmurhash.mrmr cimport hash64
from .typedefs cimport utf8_t, id_t, hash_t
@@ -11,11 +12,23 @@ cdef struct Utf8Str:
int length
+cdef struct UniStr:
+ Py_UNICODE* chars
+ size_t n
+ hash_t key
+
+
+cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
+ s.chars = &chars[start]
+ s.n = end - start
+ s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
cdef class StringStore:
cdef Pool mem
- cdef PreshMap table
+ cdef PreshMap _map
cdef Utf8Str* strings
cdef int size
cdef int _resize_at
- cdef Utf8Str* intern(self, char* chars, int length) except NULL
+ cdef const Utf8Str* intern(self, char* chars, int length) except NULL
diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx
index 18d4a4e5e..1d2b7a264 100644
--- a/spacy/utf8string.pyx
+++ b/spacy/utf8string.pyx
@@ -5,10 +5,11 @@ import codecs
SEPARATOR = '\n|-SEP-|\n'
+
cdef class StringStore:
def __init__(self):
self.mem = Pool()
- self.table = PreshMap()
+ self._map = PreshMap()
self._resize_at = 10000
self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1
@@ -17,26 +18,30 @@ cdef class StringStore:
def __get__(self):
return self.size-1
- def __getitem__(self, string_or_id):
+ def __getitem__(self, object string_or_id):
cdef bytes byte_string
- cdef Utf8Str* utf8str
- if type(string_or_id) == int or type(string_or_id) == long:
+ cdef const Utf8Str* utf8str
+ if isinstance(string_or_id, int) or isinstance(string_or_id, long):
if string_or_id < 1 or string_or_id >= self.size:
raise IndexError(string_or_id)
utf8str = &self.strings[string_or_id]
return utf8str.chars[:utf8str.length]
- elif type(string_or_id) == bytes:
+ elif isinstance(string_or_id, bytes):
utf8str = self.intern(string_or_id, len(string_or_id))
return utf8str.i
+ elif isinstance(string_or_id, unicode):
+ byte_string = string_or_id.encode('utf8')
+ utf8str = self.intern(byte_string, len(byte_string))
+ return utf8str.i
else:
raise TypeError(type(string_or_id))
- cdef Utf8Str* intern(self, char* chars, int length) except NULL:
+ cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index. We waste
# slot 0 to simplify the code, because it doesn't matter.
assert length != 0
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
- cdef void* value = self.table.get(key)
+ cdef void* value = self._map.get(key)
cdef size_t i
if value == NULL:
if self.size == self._resize_at:
@@ -48,7 +53,7 @@ cdef class StringStore:
self.strings[i].chars = self.mem.alloc(length, sizeof(char))
memcpy(self.strings[i].chars, chars, length)
self.strings[i].length = length
- self.table.set(key, self.size)
+ self._map.set(key, self.size)
self.size += 1
else:
i = value
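
The StringStore's __getitem__ is now bidirectional: an integer ID returns the stored bytes, while a bytes or unicode key (unicode is UTF-8 encoded first) is interned and its ID returned, with slot 0 deliberately wasted so that 0 can mean "missing". A minimal pure-Python model of that contract, making no assumptions about the hashing or memory layout:

    class ToyStringStore:
        def __init__(self):
            self._map = {}            # string -> id
            self._strings = [None]    # id -> string; slot 0 means "missing"

        def __getitem__(self, string_or_id):
            if isinstance(string_or_id, int):
                if string_or_id < 1 or string_or_id >= len(self._strings):
                    raise IndexError(string_or_id)
                return self._strings[string_or_id]
            if string_or_id not in self._map:
                self._map[string_or_id] = len(self._strings)
                self._strings.append(string_or_id)
            return self._map[string_or_id]

    store = ToyStringStore()
    assert store[u'Hello'] == 1
    assert store[1] == u'Hello'
    assert store[u'hello'] != store[u'Hello']

This mirrors the behaviour exercised in tests/test_intern.py later in this patch.
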
diff --git a/spacy/util.py b/spacy/util.py
index 5062ca6db..1c25aeaf2 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
def read_lang_data(name):
data_dir = path.join(DATA_DIR, name)
- tokenization = read_tokenization(name)
+ with open(path.join(data_dir, 'specials.json')) as file_:
+ tokenization = ujson.load(file_)
prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir)
infix = read_infix(data_dir)
@@ -26,12 +27,14 @@ def read_prefix(data_dir):
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return expression
+
def read_suffix(data_dir):
- with utf8open(path.join(data_dir, 'suffix')) as file_:
+ with utf8open(path.join(data_dir, 'suffix')) as file_:
entries = file_.read().split('\n')
- expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+ expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
return expression
+
def read_infix(data_dir):
with utf8open(path.join(data_dir, 'infix')) as file_:
entries = file_.read().split('\n')
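
One consequence of the read_suffix change is worth spelling out: read_prefix still escapes each entry and anchors it at the start of the string, while read_suffix now joins the entries as ready-made regular expressions anchored at the end, so any regex metacharacters must be pre-escaped in the suffix data and richer patterns (such as lookbehinds) become possible. A hedged illustration with made-up entries, not the shipped data files:

    import re

    prefix_entries = ['(', '"', '$']
    suffix_entries = [r'\)', r'\"', r"(?<=[a-z])\."]

    prefix_re = '|'.join('^' + re.escape(p) for p in prefix_entries if p.strip())
    suffix_re = '|'.join(s + '$' for s in suffix_entries if s.strip())

    assert re.search(prefix_re, '"Hello')
    assert re.search(suffix_re, 'etc.')
    assert not re.search(suffix_re, 'U.S.')   # the lookbehind rejects 'S.'
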
diff --git a/tests/test_ner.py b/tests/depr_test_ner.py
similarity index 100%
rename from tests/test_ner.py
rename to tests/depr_test_ner.py
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index b7347a617..1e697afd2 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -20,15 +20,18 @@ def test_apostrophe():
def test_LL():
tokens = EN.tokenize("we'll")
assert len(tokens) == 2
- assert tokens[1].string == "will"
+ assert tokens[1].string == "'ll"
+ assert tokens[1].lemma == "will"
assert tokens[0].string == "we"
def test_aint():
tokens = EN.tokenize("ain't")
assert len(tokens) == 2
- assert tokens[0].string == "are"
- assert tokens[1].string == "not"
+ assert tokens[0].string == "ai"
+ assert tokens[0].lemma == "be"
+ assert tokens[1].string == "n't"
+ assert tokens[1].lemma == "not"
def test_capitalized():
@@ -38,4 +41,12 @@ def test_capitalized():
assert len(tokens) == 2
tokens = EN.tokenize("Ain't")
assert len(tokens) == 2
- assert tokens[0].string == "Are"
+ assert tokens[0].string == "Ai"
+ assert tokens[0].lemma == "be"
+
+
+def test_punct():
+ tokens = EN.tokenize("We've")
+ assert len(tokens) == 2
+ tokens = EN.tokenize("``We've")
+ assert len(tokens) == 3
diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py
index 6bb58e661..143be607d 100644
--- a/tests/test_emoticons.py
+++ b/tests/test_emoticons.py
@@ -27,3 +27,9 @@ def test_tweebo_challenge():
assert tokens[19].string == '")'
assert tokens[20].string == ':>'
assert tokens[21].string == '....'
+
+
+def test_false_positive():
+ text = "example:)"
+ tokens = EN.tokenize(text)
+ assert len(tokens) == 3
diff --git a/tests/test_intern.py b/tests/test_intern.py
index 63b4b3433..a7a801b05 100644
--- a/tests/test_intern.py
+++ b/tests/test_intern.py
@@ -19,8 +19,12 @@ def test_save_bytes(sstore):
def test_save_unicode(sstore):
- with pytest.raises(TypeError):
- A_i = sstore['A']
+ Hello_i = sstore[u'Hello']
+ assert Hello_i == 1
+ assert sstore[u'Hello'] == 1
+ assert sstore[u'goodbye'] != Hello_i
+ assert sstore[u'hello'] != Hello_i
+ assert Hello_i == 1
def test_zero_id(sstore):
diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py
new file mode 100644
index 000000000..379ebd3bb
--- /dev/null
+++ b/tests/test_iter_lexicon.py
@@ -0,0 +1,15 @@
+import pytest
+
+from spacy.en import EN
+
+def test_range_iter():
+ EN.load()
+ for i in range(len(EN.lexicon)):
+ lex = EN.lexicon[i]
+
+
+def test_iter():
+ EN.load()
+ i = 0
+ for lex in EN.lexicon:
+ i += 1
diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py
new file mode 100644
index 000000000..2047e4d2c
--- /dev/null
+++ b/tests/test_lemmatizer.py
@@ -0,0 +1,34 @@
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.util import DATA_DIR
+from os import path
+
+import pytest
+
+
+def test_read_index():
+ wn = path.join(DATA_DIR, 'wordnet')
+ index = read_index(path.join(wn, 'index.noun'))
+ assert 'man' in index
+ assert 'plantes' not in index
+ assert 'plant' in index
+
+
+def test_read_exc():
+ wn = path.join(DATA_DIR, 'wordnet')
+ exc = read_exc(path.join(wn, 'verb.exc'))
+ assert exc['was'] == ('be',)
+
+
+@pytest.fixture
+def lemmatizer():
+ return Lemmatizer(path.join(DATA_DIR, 'wordnet'))
+
+
+def test_noun_lemmas(lemmatizer):
+ do = lemmatizer.noun
+
+ assert do('aardwolves') == set(['aardwolf'])
+ assert do('aardwolf') == set(['aardwolf'])
+ assert do('planets') == set(['planet'])
+ assert do('ring') == set(['ring'])
+ assert do('axes') == set(['axis', 'axe', 'ax'])
diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py
index 10276d8ea..c1fe2d847 100644
--- a/tests/test_lexeme_flags.py
+++ b/tests/test_lexeme_flags.py
@@ -7,6 +7,7 @@ from spacy.lexeme import *
def test_is_alpha():
+ EN.load()
the = EN.lexicon['the']
assert the['flags'] & (1 << IS_ALPHA)
year = EN.lexicon['1999']
@@ -16,6 +17,7 @@ def test_is_alpha():
def test_is_digit():
+ EN.load()
the = EN.lexicon['the']
assert not the['flags'] & (1 << IS_DIGIT)
year = EN.lexicon['1999']
diff --git a/tests/test_rules.py b/tests/test_rules.py
deleted file mode 100644
index b19a1c3f1..000000000
--- a/tests/test_rules.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from spacy import util
-
-
-def test_load_en():
- rules = util.read_tokenization('en')
- assert len(rules) != 0
- aint = [rule for rule in rules if rule[0] == "ain't"][0]
- chunk, pieces = aint
- assert chunk == "ain't"
- assert pieces[0] == "are"
- assert pieces[1] == "not"
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index fb5f78ed7..21d115b9b 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -34,7 +34,7 @@ def test_digits():
def test_contraction():
tokens = EN.tokenize("don't giggle")
assert len(tokens) == 3
- assert tokens[1].sic == EN.lexicon["not"]['sic']
+ assert tokens[1].sic == EN.lexicon["n't"]['sic']
tokens = EN.tokenize("i said don't!")
assert len(tokens) == 5
assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
tokens = EN.tokenize(text)
assert len(tokens) == 8
+
def test_cnts2():
text = u"""U.N. regulations are not a part of their concern."""
tokens = EN.tokenize(text)
assert len(tokens) == 10
+
def test_cnts3():
text = u"“Isn't it?”"
tokens = EN.tokenize(text)
- assert len(tokens) == 6
+ words = [t.string for t in tokens]
+ assert len(words) == 6
+
def test_cnts4():
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
tokens = EN.tokenize(text)
- assert len(tokens) == 15
+ words = [t.string for t in tokens]
+ assert len(words) == 15
+
def test_cnts5():
text = """'Me too!', Mr. P. Delaware cried. """
tokens = EN.tokenize(text)
assert len(tokens) == 11
+
def test_cnts6():
text = u'They ran about 10km.'
tokens = EN.tokenize(text)
- assert len(tokens) == 6
+ words = [t.string for t in tokens]
+ assert len(words) == 6
+
#def test_cnts7():
# text = 'But then the 6,000-year ice age came...'