Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 09:14:32 +03:00)

Commit ca54d58638: Merge setup.py
@@ -11,3 +11,8 @@ $
'
``
`
#
US$
C$
A$
a-
@@ -1,13 +1,13 @@
,
"
)
]
}
*
!
?
\"
\)
\]
\}
\*
\!
\?
%
$
\$
>
:
;
@@ -16,7 +16,8 @@ $
''
's
'S
.
..
...
....
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9])\.
(?<=[0-9])km
@@ -4,101 +4,9 @@
#*---* ---
#*'s 's

's 's
'S 'S
ain't are not
aren't are not
can't can not
cannot can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd he would
how'll he will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I'ma I will
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd there would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll who will
why're why are
why's why 's
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have
'em them
'ol old
10km 10 km
U.S. U.S.
U.K. U.K.
non-U.S. non-U.S.
U.N. U.N.
Co. Co.
@@ -115,7 +23,12 @@ A.G. A.G.
Rep. Rep.
Ms. Ms.
Mr. Mr.
Mrs. Mrs.
a.m. a.m.
Sen. Sen.
INC. INC.
CO. CO.
COS. COS.
p.m. p.m.
Nos. Nos.
a.k.a. a.k.a.
@@ -127,6 +40,7 @@ E. E.
F. F.
G. G.
H. H.
I. I.
J. J.
K. K.
L. L.
@@ -205,6 +119,9 @@ Wash. Wash.
W.Va. W.Va.
Wis. Wis.
Wyo. Wyo.
L.A. L.A.
R.H. R.H.
Gov. Gov.
'' ''
:) :)
<3 <3
@@ -262,3 +179,19 @@ V_V V_V
o.O o.O
") ")
.... ....
a- a -
Messrs. Messrs.
No. No.
vs. vs.
Gen. Gen.
Cos. Cos.
L.J. L.J.
D.T. D.T.
Prof. Prof.
Bros. Bros.
J.C. J.C.
Neb. Neb.
Adm. Adm.
U.S.S.R. U.S.S.R.
Rev. Rev.
H.F. H.F.
@@ -3,45 +3,228 @@
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

================================
spaCy NLP Tokenizer and Lexicon
================================

spaCy is a library for industrial strength NLP in Python. Its core
values are:
spaCy is a library for industrial-strength NLP in Python and Cython. spaCy's
take on NLP is that it's mostly about feature extraction --- that's the part
that's specific to NLP, so that's what an NLP library should focus on.

* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x
  faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is
  over 100x faster than Stanford's.
spaCy also believes that for NLP, **efficiency is critical**. If you're
running batch jobs, you probably have an enormous amount of data; if you're
serving requests one-by-one, you want lower latency and fewer servers. Even if
you're doing exploratory research on relatively small samples, you should still
value efficiency, because it means you can run more experiments.

* **Accuracy**: All spaCy tools are within 0.5% of the current published
  state-of-the-art, on both news and web text. NLP moves fast, so always check
  the numbers --- and don't settle for tools that aren't backed by
  rigorous recent evaluation.
Depending on the task, spaCy is between 10 and 200 times faster than NLTK,
often with much better accuracy. See Benchmarks for details, and
Why is spaCy so fast? for a discussion of the algorithms and implementation
that make this possible.

* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You
  get 1 --- the best one --- with a simple, low-level interface. This keeps the
  code-base small and concrete. Our Python APIs use lists and
  dictionaries, and our C/Cython APIs use arrays and simple structs.
+---------+----------+------------+----------+
| System  | Tokenize | --> Counts | --> Stem |
+---------+----------+------------+----------+
| spaCy   | 1m42s    | 1m59s      | 1m59s    |
+---------+----------+------------+----------+
| NLTK    | 20m2s    | 28m24s     | 52m28s   |
+---------+----------+------------+----------+

Times for 100m words of text.


Unique Lexicon-centric design
=============================

spaCy helps you build models that generalise better, by making it easy to use
more robust features. Instead of a list of strings, the tokenizer returns
references to rich lexical types. Features which ask about the word's Brown cluster,
its typical part-of-speech tag, how it's usually cased etc require no extra effort:

    >>> from spacy.en import EN
    >>> from spacy.feature_names import *
    >>> feats = (
            SIC,       # ID of the original word form
            STEM,      # ID of the stemmed word form
            CLUSTER,   # ID of the word's Brown cluster
            IS_TITLE,  # Was the word title-cased?
            POS_TYPE   # A cluster ID describing what POS tags the word is usually assigned
        )
    >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^')
    >>> tokens.to_array(feats)[:5]
    array([[ 1, 2, 3, 4],
           [...],
           [...],
           [...]])


spaCy is designed to **make the right thing easy**, where the right thing is to:

* **Use rich distributional and orthographic features**. Without these, your model
  will be very brittle and domain dependent.

* **Compute features per type, not per token**. Because of Zipf's law, you can
  expect this to be exponentially more efficient (see the sketch below).

* **Minimize string processing**, and instead compute with arrays of ID ints.

For the current list of lexical features, see `Lexical Features`_.
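
To make the per-type point concrete, here is a minimal, hypothetical sketch; it is
not spaCy's API, and ``word_features`` is a stand-in for the real lexical features.
Because values are cached by word type, the cost of feature extraction grows with
the number of distinct types, which Zipf's law keeps far smaller than the number of
tokens.

.. code-block:: python

    from __future__ import print_function

    def word_features(word):
        # Stand-in for real lexical features (cluster, shape, case, length, ...).
        return (word.lower(), word.istitle(), word.isdigit(), len(word))

    def featurize(tokens, cache):
        # Features are computed once per *type*; repeated tokens are cache hits.
        feats = []
        for word in tokens:
            if word not in cache:
                cache[word] = word_features(word)
            feats.append(cache[word])
        return feats

    cache = {}
    tokens = u"the cat sat on the mat near the other cat".split()
    print(len(tokens), "tokens,", len(set(tokens)), "types")  # 10 tokens, 7 types
    print(featurize(tokens, cache)[0])                        # ('the', False, False, 3)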

Comparison
----------

.. _lexical features: features.html

+----------------+-------------+--------+---------------+--------------+
| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. (web) |
+----------------+-------------+--------+---------------+--------------+
| spaCy          | 107,000     | 1.3gb  | 96.7          |              |
+----------------+-------------+--------+---------------+--------------+
| Stanford       | 8,000       | 1.5gb  | 96.7          |              |
+----------------+-------------+--------+---------------+--------------+
| NLTK           | 543         | 61mb   | 94.0          |              |
+----------------+-------------+--------+---------------+--------------+

Tokenization done right
=======================

Most tokenizers rely on complicated regular expressions. Often, they leave you
with no way to align the tokens back to the original string --- a vital feature
if you want to display some mark-up, such as spelling correction. The regular
expressions also interact, making it hard to accommodate special cases.

spaCy introduces a **novel tokenization algorithm** that's much faster and much
more flexible:

.. code-block:: python

    def tokenize(string, prefixes={}, suffixes={}, specials={}):
        '''Sketch of spaCy's tokenization algorithm.'''
        tokens = []
        cache = {}
        for chunk in string.split():
            # Because of Zipf's law, the cache serves the majority of "chunks".
            if chunk in cache:
                tokens.extend(cache[chunk])
                continue
            key = chunk

            subtokens = []
            # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . :
            # If we split one off, check whether we're left with a special-case,
            # e.g. contractions (can't, won't, etc), emoticons, abbreviations, etc.
            # This makes the tokenization easy to update and customize.
            while chunk:
                prefix, chunk = _consume_prefix(chunk, prefixes)
                if prefix:
                    subtokens.append(prefix)
                    if chunk in specials:
                        subtokens.extend(specials[chunk])
                        break
                suffix, chunk = _consume_suffix(chunk, suffixes)
                if suffix:
                    subtokens.append(suffix)
                    if chunk in specials:
                        subtokens.extend(specials[chunk])
                        break
                if not prefix and not suffix:
                    # Nothing left to split off; keep the remainder as a token.
                    subtokens.append(chunk)
                    break
            cache[key] = subtokens
            tokens.extend(subtokens)
        return tokens
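
As a rough, hypothetical illustration of how the special-case table plugs into the
sketch above: ``_consume_prefix`` and ``_consume_suffix`` are not shown in the sketch,
so the one-character versions below are stand-ins, not spaCy's implementation.

.. code-block:: python

    def _consume_prefix(chunk, prefixes):
        # Stand-in helper: split off a single leading prefix character.
        if chunk and chunk[0] in prefixes:
            return chunk[0], chunk[1:]
        return None, chunk

    def _consume_suffix(chunk, suffixes):
        # Stand-in helper: split off a single trailing suffix character.
        if chunk and chunk[-1] in suffixes:
            return chunk[-1], chunk[:-1]
        return None, chunk

    # Mirrors the special-case rule "ain't -> are not" listed earlier.
    specials = {u"ain't": [u'are', u'not']}
    print(tokenize(u"she said (ain't", prefixes=set(u'("'),
                   suffixes=set(u',.)"'), specials=specials))
    # ['she', 'said', '(', 'are', 'not']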

Your data is going to have its own quirks, so it's really useful to have
a tokenizer you can easily control. To see the limitations of the standard
regex-based approach, check out `CMU's recent work on tokenizing tweets <http://www.ark.cs.cmu.edu/TweetNLP/>`_. Despite a lot of careful attention, they can't handle all of their
known emoticons correctly --- doing so would interfere with the way they
process other punctuation. This isn't a problem for spaCy: we just add them
all to the special tokenization rules.

spaCy's tokenizer is also incredibly efficient:

spaCy can create an inverted index of the 1.8 billion word Gigaword corpus,
in under half an hour --- on a MacBook Air. See the `inverted
index tutorial`_.

.. _inverted index tutorial: index_tutorial.html

Comparison with NLTK
====================

`NLTK <http://nltk.org>`_ provides interfaces to a wide variety of NLP
tools and resources, and its own implementations of a few algorithms. It comes
with comprehensive documentation, and a book introducing concepts in NLP. For
these reasons, it's very widely known. However, if you're trying to make money
or do cutting-edge research, NLTK is not a good choice.

The `list of stuff in NLTK <http://www.nltk.org/py-modindex.html>`_ looks impressive,
but almost none of it is useful for real work. You're not going to make any money,
or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation,
etc. Most of NLTK is there to assist in the explanation of ideas in computational
linguistics, at roughly an undergraduate level.
But it also claims to support serious work, by wrapping external tools.

In a pretty well-known essay, Joel Spolsky discusses the pain of dealing with
`leaky abstractions <http://www.joelonsoftware.com/articles/LeakyAbstractions.html>`_.
An abstraction tells you not to care about implementation
details, but sometimes the implementation matters after all. When it
does, you have to waste time revising your assumptions.

NLTK's wrappers call external tools via subprocesses, and wrap this up so
that it looks like a native API. This abstraction leaks *a lot*. The system
calls impose far more overhead than a normal Python function call, which makes
the most natural way to program against the API infeasible.


Case study: POS tagging
-----------------------

Here's a quick comparison of the following POS taggers:

* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process
  from the command-line;
* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via
  NLTK's wrapper;
* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document;
* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document.


+-------------------+-------------+--------+
| System            | Speed (w/s) | % Acc. |
+-------------------+-------------+--------+
| spaCy             | 107,000     | 96.7   |
+-------------------+-------------+--------+
| Stanford (CLI)    | 8,000       | 96.7   |
+-------------------+-------------+--------+
| nltk.pos_tag      | 543         | 94.0   |
+-------------------+-------------+--------+
| nltk.tag.stanford | 209         | 96.7   |
+-------------------+-------------+--------+

Experimental details TODO. Three things are apparent from this comparison:

1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate;

2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower
   than invoking the model once as a batch process, via the command-line;

3. spaCy is over 10x faster than the Stanford tagger, even when called
   **sentence-by-sentence**.

The problem is that NLTK simply wraps the command-line
interfaces of these tools, so communication is via a subprocess. NLTK does not
even hold open a pipe for you --- the model is reloaded, again and again.

To use the wrapper effectively, you should batch up your text as much as possible.
This probably isn't how you would like to structure your pipeline, and you
might not be able to batch up much text at all, e.g. if serving a single
request means processing a single document. A sketch of the difference follows.
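
The following is a purely hypothetical cost model, not NLTK's or spaCy's API; it only
illustrates why per-document calls to a subprocess-backed wrapper are so much slower
than one batched call, since every call pays the model start-up cost again.

.. code-block:: python

    # Assumed, made-up costs for illustration only.
    STARTUP_COST = 2.0     # seconds per call: process launch plus model loading
    PER_WORD_COST = 1e-4   # seconds per word of actual tagging work

    def modelled_cost(documents):
        # Hypothetical wrapper call: the start-up cost is paid once per call.
        n_words = sum(len(doc.split()) for doc in documents)
        return STARTUP_COST + n_words * PER_WORD_COST

    docs = ["a short document"] * 100

    per_document = sum(modelled_cost([doc]) for doc in docs)  # ~200s, dominated by start-up
    batched = modelled_cost(docs)                             # ~2s
    print("%.1fs vs %.1fs" % (per_document, batched))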

Technically, NLTK does give you Python functions to access lots of different
systems --- but, you can't use them as you would expect to use a normal Python
function. The abstraction leaks.

Here's the bottom line: the Stanford tools are written in Java, so using them
from Python sucks. You shouldn't settle for this. It's a problem that springs
purely from the tooling, rather than the domain.

Summary
-------

NLTK is a well-known Python library for NLP, but for the important bits, you
don't get actual Python modules. You get wrappers which throw to external
tools, via subprocesses. This is not at all the same thing.

spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other
high-performance Python libraries. So you get a native Python API, but the
performance you expect from a program written in C.


.. toctree::
    :hidden:
    :maxdepth: 3

    what/index.rst
    why/index.rst
    how/index.rst

    features.rst
    license_stories.rst
setup.py (26 lines changed)
@@ -10,6 +10,8 @@ import os.path
from os import path
from glob import glob

import numpy


def clean(ext):
    for pyx in ext.sources:
@@ -34,7 +36,7 @@ compile_args = []
link_args = []
libs = []

includes = ['.']
includes = ['.', numpy.get_include()]
cython_includes = ['.']


@@ -50,18 +52,20 @@ exts = [
    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++",
              include_dirs=includes),
    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
]

@ -1,66 +0,0 @@
|
|||
from thinc.typedefs cimport atom_t
|
||||
from .typedefs cimport hash_t
|
||||
from .tokens cimport Tokens
|
||||
from .lexeme cimport Lexeme
|
||||
|
||||
|
||||
cdef class Token:
|
||||
cdef readonly atom_t sic
|
||||
cdef readonly atom_t cluster
|
||||
cdef readonly atom_t norm
|
||||
cdef readonly atom_t shape
|
||||
cdef readonly atom_t asciied
|
||||
cdef readonly atom_t prefix
|
||||
cdef readonly atom_t suffix
|
||||
cdef readonly atom_t length
|
||||
|
||||
cdef readonly atom_t postype
|
||||
cdef readonly atom_t nertype
|
||||
cdef readonly atom_t sensetype
|
||||
|
||||
cdef readonly atom_t is_alpha
|
||||
cdef readonly atom_t is_ascii
|
||||
cdef readonly atom_t is_digit
|
||||
cdef readonly atom_t is_lower
|
||||
cdef readonly atom_t is_punct
|
||||
cdef readonly atom_t is_space
|
||||
cdef readonly atom_t is_title
|
||||
cdef readonly atom_t is_upper
|
||||
cdef readonly atom_t like_url
|
||||
cdef readonly atom_t like_number
|
||||
cdef readonly atom_t oft_lower
|
||||
cdef readonly atom_t oft_title
|
||||
cdef readonly atom_t oft_upper
|
||||
|
||||
cdef readonly atom_t in_males
|
||||
cdef readonly atom_t in_females
|
||||
cdef readonly atom_t in_surnames
|
||||
cdef readonly atom_t in_places
|
||||
cdef readonly atom_t in_games
|
||||
cdef readonly atom_t in_celebs
|
||||
cdef readonly atom_t in_names
|
||||
|
||||
cdef readonly atom_t pos
|
||||
cdef readonly atom_t sense
|
||||
cdef readonly atom_t ner
|
||||
|
||||
|
||||
cdef class Slots:
|
||||
cdef readonly Token P4
|
||||
cdef readonly Token P3
|
||||
cdef readonly Token P2
|
||||
cdef readonly Token P1
|
||||
cdef readonly Token N0
|
||||
cdef readonly Token N1
|
||||
cdef readonly Token N2
|
||||
cdef readonly Token N3
|
||||
cdef readonly Token N4
|
||||
|
||||
|
||||
cdef int N_FIELDS
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
|
||||
|
||||
|
||||
cpdef Slots FIELD_IDS
|
|
@ -1,126 +0,0 @@
|
|||
from murmurhash.mrmr cimport hash64
|
||||
from .lexeme cimport *
|
||||
|
||||
|
||||
cdef class Slots:
|
||||
def __init__(self):
|
||||
self.P4 = Token()
|
||||
self.P3 = Token()
|
||||
self.P2 = Token()
|
||||
self.P1 = Token()
|
||||
self.N0 = Token()
|
||||
self.N1 = Token()
|
||||
self.N2 = Token()
|
||||
self.N3 = Token()
|
||||
self.N4 = Token()
|
||||
|
||||
|
||||
cdef void _number_token(Token t, int* n_fields):
|
||||
cdef int i = n_fields[0]
|
||||
t.sic = i; i += 1
|
||||
t.cluster = i; i += 1
|
||||
t.norm = i; i += 1
|
||||
t.shape = i; i += 1
|
||||
t.prefix = i; i += 1
|
||||
t.suffix = i; i += 1
|
||||
t.length = i; i += 1
|
||||
|
||||
t.postype = i; i += 1
|
||||
t.nertype = i; i += 1
|
||||
t.sensetype = i; i += 1
|
||||
|
||||
t.is_alpha = i; i += 1
|
||||
t.is_ascii = i; i += 1
|
||||
t.is_digit = i; i += 1
|
||||
t.is_lower = i; i += 1
|
||||
t.is_punct = i; i += 1
|
||||
t.is_space = i; i += 1
|
||||
t.is_title = i; i += 1
|
||||
t.is_upper = i; i += 1
|
||||
|
||||
t.like_number = i; i += 1
|
||||
t.like_url = i; i += 1
|
||||
|
||||
t.oft_lower = i; i += 1
|
||||
t.oft_title = i; i += 1
|
||||
t.oft_upper = i; i += 1
|
||||
|
||||
t.in_males = i; i += 1
|
||||
t.in_females = i; i += 1
|
||||
t.in_surnames = i; i += 1
|
||||
t.in_places = i; i += 1
|
||||
t.in_games = i; i += 1
|
||||
t.in_celebs = i; i += 1
|
||||
t.in_names = i; i += 1
|
||||
|
||||
t.pos = i; i += 1
|
||||
t.sense = i; i += 1
|
||||
t.ner = i; i += 1
|
||||
|
||||
n_fields[0] = i
|
||||
|
||||
|
||||
cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner):
|
||||
c[t.sic] = lex.sic
|
||||
c[t.cluster] = lex.cluster
|
||||
c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
|
||||
c[t.shape] = lex.shape
|
||||
c[t.asciied] = lex.asciied
|
||||
c[t.prefix] = lex.prefix
|
||||
c[t.suffix] = lex.suffix
|
||||
c[t.length] = lex.length
|
||||
|
||||
c[t.postype] = lex.postype
|
||||
c[t.nertype] = 0
|
||||
c[t.sensetype] = 0
|
||||
|
||||
c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)
|
||||
c[t.is_digit] = lex.flags & (1 << IS_DIGIT)
|
||||
c[t.is_lower] = lex.flags & (1 << IS_LOWER)
|
||||
c[t.is_punct] = lex.flags & (1 << IS_PUNCT)
|
||||
c[t.is_space] = lex.flags & (1 << IS_SPACE)
|
||||
c[t.is_title] = lex.flags & (1 << IS_TITLE)
|
||||
c[t.is_upper] = lex.flags & (1 << IS_UPPER)
|
||||
c[t.like_url] = lex.flags & (1 << LIKE_URL)
|
||||
c[t.like_number] = lex.flags & (1 << LIKE_NUMBER)
|
||||
c[t.oft_lower] = lex.flags & (1 << OFT_LOWER)
|
||||
c[t.oft_title] = lex.flags & (1 << OFT_TITLE)
|
||||
c[t.oft_upper] = lex.flags & (1 << OFT_UPPER)
|
||||
|
||||
c[t.in_males] = lex.flags & (1 << IN_MALES)
|
||||
c[t.in_females] = lex.flags & (1 << IN_FEMALES)
|
||||
c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES)
|
||||
c[t.in_places] = lex.flags & (1 << IN_PLACES)
|
||||
c[t.in_games] = lex.flags & (1 << IN_GAMES)
|
||||
c[t.in_celebs] = lex.flags & (1 << IN_CELEBS)
|
||||
c[t.in_names] = lex.flags & (1 << IN_NAMES)
|
||||
|
||||
c[t.pos] = pos
|
||||
c[t.sense] = 0
|
||||
c[t.ner] = ner
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
|
||||
_fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
|
||||
_fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3])
|
||||
_fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
|
||||
_fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
|
||||
_fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
|
||||
_fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
|
||||
_fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
|
||||
_fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3])
|
||||
_fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
|
||||
return 1
|
||||
|
||||
|
||||
N_FIELDS = 0
|
||||
FIELD_IDS = Slots()
|
||||
_number_token(FIELD_IDS.P4, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.P3, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.P2, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.P1, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.N0, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.N1, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.N2, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.N3, &N_FIELDS)
|
||||
_number_token(FIELD_IDS.N4, &N_FIELDS)
|
spacy/en.pxd (132 lines changed)
@ -1,5 +1,133 @@
|
|||
from spacy.lang cimport Language
|
||||
from spacy.tokens cimport Tokens
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
from .lang cimport Language
|
||||
from .tokens cimport Tokens
|
||||
from .tokens cimport TokenC
|
||||
|
||||
|
||||
cpdef enum en_person_t:
|
||||
NO_PERSON
|
||||
FIRST
|
||||
SECOND
|
||||
THIRD
|
||||
NON_THIRD
|
||||
|
||||
|
||||
cpdef enum en_number_t:
|
||||
NO_NUMBER
|
||||
SINGULAR
|
||||
PLURAL
|
||||
MASS
|
||||
|
||||
|
||||
cpdef enum en_gender_t:
|
||||
NO_GENDER
|
||||
MASCULINE
|
||||
FEMININE
|
||||
NEUTER
|
||||
|
||||
|
||||
cpdef enum en_case_t:
|
||||
NO_CASE
|
||||
NOMINATIVE
|
||||
GENITIVE
|
||||
ACCUSATIVE
|
||||
REFLEXIVE
|
||||
DEMONYM
|
||||
|
||||
|
||||
cpdef enum en_tenspect_t:
|
||||
NO_TENSE
|
||||
BASE_VERB
|
||||
PRESENT
|
||||
PAST
|
||||
PASSIVE
|
||||
ING
|
||||
MODAL
|
||||
|
||||
|
||||
cpdef enum misc_t:
|
||||
NO_MISC
|
||||
COMPARATIVE
|
||||
SUPERLATIVE
|
||||
RELATIVE
|
||||
NAME
|
||||
|
||||
|
||||
# Flags
|
||||
cpdef enum FlagID:
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
IS_DIGIT
|
||||
IS_LOWER
|
||||
IS_PUNCT
|
||||
IS_SPACE
|
||||
IS_TITLE
|
||||
IS_UPPER
|
||||
|
||||
LIKE_URL
|
||||
LIKE_NUMBER
|
||||
|
||||
OFT_LOWER
|
||||
OFT_TITLE
|
||||
OFT_UPPER
|
||||
|
||||
IN_MALES
|
||||
IN_FEMALES
|
||||
IN_SURNAMES
|
||||
IN_PLACES
|
||||
IN_GAMES
|
||||
IN_CELEBS
|
||||
IN_NAMES
|
||||
|
||||
|
||||
cpdef enum:
|
||||
P2_sic
|
||||
P2_cluster
|
||||
P2_shape
|
||||
P2_prefix
|
||||
P2_suffix
|
||||
P2_pos
|
||||
P2_lemma
|
||||
P2_pos_type
|
||||
|
||||
P1_sic
|
||||
P1_cluster
|
||||
P1_shape
|
||||
P1_prefix
|
||||
P1_suffix
|
||||
P1_pos
|
||||
P1_lemma
|
||||
P1_pos_type
|
||||
|
||||
W_sic
|
||||
W_cluster
|
||||
W_shape
|
||||
W_prefix
|
||||
W_suffix
|
||||
W_pos
|
||||
W_lemma
|
||||
W_pos_type
|
||||
|
||||
N1_sic
|
||||
N1_cluster
|
||||
N1_shape
|
||||
N1_prefix
|
||||
N1_suffix
|
||||
N1_pos
|
||||
N1_lemma
|
||||
N1_pos_type
|
||||
|
||||
N2_sic
|
||||
N2_cluster
|
||||
N2_shape
|
||||
N2_prefix
|
||||
N2_suffix
|
||||
N2_pos
|
||||
N2_lemma
|
||||
N2_pos_type
|
||||
|
||||
N_CONTEXT_FIELDS
|
||||
|
||||
|
||||
cdef class English(Language):
|
||||
|
|
spacy/en.pyx (160 lines changed)
@ -30,14 +30,101 @@ same scheme. Tokenization problems are a major cause of poor performance for
|
|||
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
|
||||
provides a fully Penn Treebank 3-compliant tokenizer.
|
||||
'''
|
||||
# TODO
|
||||
#The script translate_treebank_tokenization can be used to transform a treebank's
|
||||
#annotation to use one of the spacy tokenization schemes.
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
cimport lang
|
||||
from .typedefs cimport flags_t
|
||||
import orth
|
||||
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||
from .morphology cimport X, PUNCT, EOL
|
||||
|
||||
from .tokens cimport Morphology
|
||||
|
||||
|
||||
POS_TAGS = {
|
||||
'NULL': (NO_TAG, {}),
|
||||
'EOL': (EOL, {}),
|
||||
'CC': (CONJ, {}),
|
||||
'CD': (NUM, {}),
|
||||
'DT': (DET, {}),
|
||||
'EX': (DET, {}),
|
||||
'FW': (X, {}),
|
||||
'IN': (ADP, {}),
|
||||
'JJ': (ADJ, {}),
|
||||
'JJR': (ADJ, {'misc': COMPARATIVE}),
|
||||
'JJS': (ADJ, {'misc': SUPERLATIVE}),
|
||||
'LS': (X, {}),
|
||||
'MD': (VERB, {'tenspect': MODAL}),
|
||||
'NN': (NOUN, {}),
|
||||
'NNS': (NOUN, {'number': PLURAL}),
|
||||
'NNP': (NOUN, {'misc': NAME}),
|
||||
'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
|
||||
'PDT': (DET, {}),
|
||||
'POS': (PRT, {'case': GENITIVE}),
|
||||
'PRP': (NOUN, {}),
|
||||
'PRP$': (NOUN, {'case': GENITIVE}),
|
||||
'RB': (ADV, {}),
|
||||
'RBR': (ADV, {'misc': COMPARATIVE}),
|
||||
'RBS': (ADV, {'misc': SUPERLATIVE}),
|
||||
'RP': (PRT, {}),
|
||||
'SYM': (X, {}),
|
||||
'TO': (PRT, {}),
|
||||
'UH': (X, {}),
|
||||
'VB': (VERB, {}),
|
||||
'VBD': (VERB, {'tenspect': PAST}),
|
||||
'VBG': (VERB, {'tenspect': ING}),
|
||||
'VBN': (VERB, {'tenspect': PASSIVE}),
|
||||
'VBP': (VERB, {'tenspect': PRESENT}),
|
||||
'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
|
||||
'WDT': (DET, {'misc': RELATIVE}),
|
||||
'WP': (PRON, {'misc': RELATIVE}),
|
||||
'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
|
||||
'WRB': (ADV, {'misc': RELATIVE}),
|
||||
'!': (PUNCT, {}),
|
||||
'#': (PUNCT, {}),
|
||||
'$': (PUNCT, {}),
|
||||
"''": (PUNCT, {}),
|
||||
"(": (PUNCT, {}),
|
||||
")": (PUNCT, {}),
|
||||
"-LRB-": (PUNCT, {}),
|
||||
"-RRB-": (PUNCT, {}),
|
||||
".": (PUNCT, {}),
|
||||
",": (PUNCT, {}),
|
||||
"``": (PUNCT, {}),
|
||||
":": (PUNCT, {}),
|
||||
"?": (PUNCT, {}),
|
||||
}
|
||||
|
||||
|
||||
POS_TEMPLATES = (
|
||||
(W_sic,),
|
||||
(P1_lemma, P1_pos),
|
||||
(P2_lemma, P2_pos),
|
||||
(N1_sic,),
|
||||
(N2_sic,),
|
||||
|
||||
(W_suffix,),
|
||||
(W_prefix,),
|
||||
|
||||
(P1_pos,),
|
||||
(P2_pos,),
|
||||
(P1_pos, P2_pos),
|
||||
(P1_pos, W_sic),
|
||||
(P1_suffix,),
|
||||
(N1_suffix,),
|
||||
|
||||
(W_shape,),
|
||||
(W_cluster,),
|
||||
(N1_cluster,),
|
||||
(N2_cluster,),
|
||||
(P1_cluster,),
|
||||
(P2_cluster,),
|
||||
|
||||
(W_pos_type,),
|
||||
(N1_pos_type,),
|
||||
(N1_pos_type,),
|
||||
(P1_pos, W_pos_type, N1_pos_type),
|
||||
)
|
||||
|
||||
|
||||
cdef class English(Language):
|
||||
|
@ -47,7 +134,68 @@ cdef class English(Language):
|
|||
name (unicode): The two letter code used by Wikipedia for the language.
|
||||
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||
"""
|
||||
pass
|
||||
def get_props(self, unicode string):
|
||||
return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
|
||||
|
||||
def set_flags(self, unicode string):
|
||||
cdef flags_t flags = 0
|
||||
flags |= orth.is_alpha(string) << IS_ALPHA
|
||||
flags |= orth.is_ascii(string) << IS_ASCII
|
||||
flags |= orth.is_digit(string) << IS_DIGIT
|
||||
flags |= orth.is_lower(string) << IS_LOWER
|
||||
flags |= orth.is_punct(string) << IS_PUNCT
|
||||
flags |= orth.is_space(string) << IS_SPACE
|
||||
flags |= orth.is_title(string) << IS_TITLE
|
||||
flags |= orth.is_upper(string) << IS_UPPER
|
||||
|
||||
flags |= orth.like_url(string) << LIKE_URL
|
||||
flags |= orth.like_number(string) << LIKE_NUMBER
|
||||
return flags
|
||||
|
||||
def set_pos(self, Tokens tokens):
|
||||
cdef int i
|
||||
cdef atom_t[N_CONTEXT_FIELDS] context
|
||||
cdef TokenC* t = tokens.data
|
||||
assert self.morphologizer is not None
|
||||
cdef dict tagdict = self.pos_tagger.tagdict
|
||||
for i in range(tokens.length):
|
||||
if t[i].lex.sic in tagdict:
|
||||
t[i].pos = tagdict[t[i].lex.sic]
|
||||
else:
|
||||
fill_pos_context(context, i, t)
|
||||
t[i].pos = self.pos_tagger.predict(context)
|
||||
self.morphologizer.set_morph(i, t)
|
||||
|
||||
def train_pos(self, Tokens tokens, golds):
|
||||
cdef int i
|
||||
cdef atom_t[N_CONTEXT_FIELDS] context
|
||||
c = 0
|
||||
cdef TokenC* t = tokens.data
|
||||
for i in range(tokens.length):
|
||||
fill_pos_context(context, i, t)
|
||||
t[i].pos = self.pos_tagger.predict(context, [golds[i]])
|
||||
self.morphologizer.set_morph(i, t)
|
||||
c += t[i].pos == golds[i]
|
||||
return c
|
||||
|
||||
|
||||
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
||||
_fill_from_token(&context[P2_sic], &tokens[i-2])
|
||||
_fill_from_token(&context[P1_sic], &tokens[i-1])
|
||||
_fill_from_token(&context[W_sic], &tokens[i])
|
||||
_fill_from_token(&context[N1_sic], &tokens[i+1])
|
||||
_fill_from_token(&context[N2_sic], &tokens[i+2])
|
||||
|
||||
|
||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
||||
context[0] = t.lex.sic
|
||||
context[1] = t.lex.cluster
|
||||
context[2] = t.lex.shape
|
||||
context[3] = t.lex.prefix
|
||||
context[4] = t.lex.suffix
|
||||
context[5] = t.pos
|
||||
context[6] = t.lemma
|
||||
context[7] = t.lex.pos_type
|
||||
|
||||
|
||||
EN = English('en')
|
||||
|
|
|
@ -1,38 +1,38 @@
|
|||
from libcpp.vector cimport vector
|
||||
|
||||
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
|
||||
|
||||
from preshed.maps cimport PreshMap
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from .tokens cimport Tokens
|
||||
from .tokens cimport Tokens, TokenC
|
||||
from .lexeme cimport Lexeme
|
||||
from .tagger cimport Tagger
|
||||
from .ner.greedy_parser cimport NERParser
|
||||
from .utf8string cimport StringStore
|
||||
from .utf8string cimport StringStore, UniStr
|
||||
from .morphology cimport Morphologizer
|
||||
|
||||
|
||||
cdef extern from "Python.h":
|
||||
cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
|
||||
cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
|
||||
cdef bint Py_UNICODE_ISALPHA(Py_UNICODE ch)
|
||||
cdef bint Py_UNICODE_ISUPPER(Py_UNICODE ch)
|
||||
cdef union LexemesOrTokens:
|
||||
const Lexeme* const* lexemes
|
||||
TokenC* tokens
|
||||
|
||||
|
||||
cdef struct String:
|
||||
Py_UNICODE* chars
|
||||
size_t n
|
||||
hash_t key
|
||||
cdef struct Cached:
|
||||
LexemesOrTokens data
|
||||
bint is_lex
|
||||
int length
|
||||
|
||||
|
||||
cdef class Lexicon:
|
||||
cpdef public get_lex_props
|
||||
cdef Pool mem
|
||||
cpdef readonly size_t size
|
||||
cpdef readonly StringStore strings
|
||||
cdef vector[Lexeme*] lexemes
|
||||
|
||||
cdef Lexeme* get(self, String* s) except NULL
|
||||
cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
|
||||
|
||||
cdef PreshMap _dict
|
||||
cdef PreshMap _map
|
||||
|
||||
|
||||
cdef class Language:
|
||||
|
@ -41,9 +41,8 @@ cdef class Language:
|
|||
cdef PreshMap _cache
|
||||
cdef PreshMap _specials
|
||||
cpdef readonly Lexicon lexicon
|
||||
|
||||
cpdef readonly Tagger pos_tagger
|
||||
cpdef readonly NERParser ner_tagger
|
||||
cpdef readonly Morphologizer morphologizer
|
||||
|
||||
cdef object _prefix_re
|
||||
cdef object _suffix_re
|
||||
|
@ -52,13 +51,14 @@ cdef class Language:
|
|||
cpdef Tokens tokens_from_list(self, list strings)
|
||||
cpdef Tokens tokenize(self, unicode text)
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
|
||||
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
|
||||
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
|
||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
|
||||
vector[Lexeme*] *suffixes) except NULL
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
|
||||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
|
||||
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
|
||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
|
||||
|
||||
|
|
spacy/lang.pyx (319 lines changed)
@ -18,13 +18,14 @@ from preshed.maps cimport PreshMap
|
|||
from .lexeme cimport Lexeme
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport init as lexeme_init
|
||||
from .lexeme cimport check_flag
|
||||
|
||||
from .utf8string cimport slice_unicode
|
||||
|
||||
from . import util
|
||||
from .util import read_lang_data
|
||||
from .tokens import Tokens
|
||||
|
||||
from .tagger cimport Tagger
|
||||
from .ner.greedy_parser cimport NERParser
|
||||
from .tokens cimport Morphology
|
||||
|
||||
|
||||
cdef class Language:
|
||||
|
@ -37,29 +38,30 @@ cdef class Language:
|
|||
self._prefix_re = re.compile(prefix)
|
||||
self._suffix_re = re.compile(suffix)
|
||||
self._infix_re = re.compile(infix)
|
||||
self.lexicon = Lexicon()
|
||||
if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
|
||||
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
|
||||
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
|
||||
self.lexicon = Lexicon(self.get_props)
|
||||
self._load_special_tokenization(rules)
|
||||
if path.exists(path.join(util.DATA_DIR, name, 'pos')):
|
||||
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
|
||||
else:
|
||||
self.pos_tagger = None
|
||||
if path.exists(path.join(util.DATA_DIR, name, 'ner')):
|
||||
self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
|
||||
self.pos_tagger = None
|
||||
self.morphologizer = None
|
||||
|
||||
def load(self):
|
||||
self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
|
||||
self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
|
||||
if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
|
||||
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
|
||||
self.morphologizer = Morphologizer(self.lexicon.strings,
|
||||
path.join(util.DATA_DIR, self.name))
|
||||
|
||||
cpdef Tokens tokens_from_list(self, list strings):
|
||||
cdef int length = sum([len(s) for s in strings])
|
||||
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
|
||||
cdef Tokens tokens = Tokens(self, length)
|
||||
if length == 0:
|
||||
return tokens
|
||||
cdef String string_struct
|
||||
cdef UniStr string_struct
|
||||
cdef unicode py_string
|
||||
cdef int idx = 0
|
||||
for i, py_string in enumerate(strings):
|
||||
string_from_unicode(&string_struct, py_string)
|
||||
tokens.push_back(idx, self.lexicon.get(&string_struct))
|
||||
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||
tokens.push_back(idx, <const Lexeme*>self.lexicon.get(tokens.mem, &string_struct))
|
||||
idx += len(py_string) + 1
|
||||
return tokens
|
||||
|
||||
|
@ -79,22 +81,21 @@ cdef class Language:
|
|||
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
|
||||
"""
|
||||
cdef int length = len(string)
|
||||
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
|
||||
cdef Tokens tokens = Tokens(self, length)
|
||||
if length == 0:
|
||||
return tokens
|
||||
cdef int i = 0
|
||||
cdef int start = 0
|
||||
cdef bint cache_hit
|
||||
cdef Py_UNICODE* chars = string
|
||||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||
cdef String span
|
||||
cdef UniStr span
|
||||
for i in range(1, length):
|
||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||
if start < i:
|
||||
string_slice(&span, chars, start, i)
|
||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
||||
if lexemes != NULL:
|
||||
tokens.extend(start, lexemes, 0)
|
||||
else:
|
||||
slice_unicode(&span, chars, start, i)
|
||||
cache_hit = self._try_cache(start, span.key, tokens)
|
||||
if not cache_hit:
|
||||
self._tokenize(tokens, &span, start, i)
|
||||
in_ws = not in_ws
|
||||
start = i
|
||||
|
@ -102,15 +103,27 @@ cdef class Language:
|
|||
start += 1
|
||||
i += 1
|
||||
if start < i:
|
||||
string_slice(&span, chars, start, i)
|
||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
||||
if lexemes != NULL:
|
||||
tokens.extend(start, lexemes, 0)
|
||||
else:
|
||||
slice_unicode(&span, chars, start, i)
|
||||
cache_hit = self._try_cache(start, span.key, tokens)
|
||||
if not cache_hit:
|
||||
self._tokenize(tokens, &span, start, i)
|
||||
return tokens
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
|
||||
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
|
||||
#cached = <Cached*>self._specials.get(key)
|
||||
cached = <Cached*>self._cache.get(key)
|
||||
if cached == NULL:
|
||||
return False
|
||||
cdef int i
|
||||
if cached.is_lex:
|
||||
for i in range(cached.length):
|
||||
idx = tokens.push_back(idx, cached.data.lexemes[i])
|
||||
else:
|
||||
for i in range(cached.length):
|
||||
idx = tokens.push_back(idx, &cached.data.tokens[i])
|
||||
return True
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
|
||||
cdef vector[Lexeme*] prefixes
|
||||
cdef vector[Lexeme*] suffixes
|
||||
cdef hash_t orig_key
|
||||
|
@ -119,88 +132,95 @@ cdef class Language:
|
|||
orig_size = tokens.length
|
||||
self._split_affixes(span, &prefixes, &suffixes)
|
||||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
|
||||
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
||||
|
||||
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
|
||||
vector[Lexeme*] *suffixes) except NULL:
|
||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
||||
vector[const Lexeme*] *suffixes) except NULL:
|
||||
cdef size_t i
|
||||
cdef String prefix
|
||||
cdef String suffix
|
||||
cdef String minus_pre
|
||||
cdef String minus_suf
|
||||
cdef UniStr prefix
|
||||
cdef UniStr suffix
|
||||
cdef UniStr minus_pre
|
||||
cdef UniStr minus_suf
|
||||
cdef size_t last_size = 0
|
||||
while string.n != 0 and string.n != last_size:
|
||||
last_size = string.n
|
||||
pre_len = self._find_prefix(string.chars, string.n)
|
||||
if pre_len != 0:
|
||||
string_slice(&prefix, string.chars, 0, pre_len)
|
||||
string_slice(&minus_pre, string.chars, pre_len, string.n)
|
||||
slice_unicode(&prefix, string.chars, 0, pre_len)
|
||||
slice_unicode(&minus_pre, string.chars, pre_len, string.n)
|
||||
# Check whether we've hit a special-case
|
||||
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
||||
string[0] = minus_pre
|
||||
prefixes.push_back(self.lexicon.get(&prefix))
|
||||
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||
break
|
||||
suf_len = self._find_suffix(string.chars, string.n)
|
||||
if suf_len != 0:
|
||||
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
|
||||
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||
slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
|
||||
slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||
# Check whether we've hit a special-case
|
||||
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
||||
string[0] = minus_suf
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||
break
|
||||
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
||||
string_slice(string, string.chars, pre_len, string.n - suf_len)
|
||||
prefixes.push_back(self.lexicon.get(&prefix))
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
|
||||
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||
elif pre_len:
|
||||
string[0] = minus_pre
|
||||
prefixes.push_back(self.lexicon.get(&prefix))
|
||||
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||
elif suf_len:
|
||||
string[0] = minus_suf
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||
if self._specials.get(string.key):
|
||||
break
|
||||
return string
|
||||
|
||||
cdef int _attach_tokens(self, Tokens tokens,
|
||||
int idx, String* string,
|
||||
vector[Lexeme*] *prefixes,
|
||||
vector[Lexeme*] *suffixes) except -1:
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||
vector[const Lexeme*] *prefixes,
|
||||
vector[const Lexeme*] *suffixes) except -1:
|
||||
cdef bint cache_hit
|
||||
cdef int split
|
||||
cdef Lexeme** lexemes
|
||||
cdef const Lexeme* const* lexemes
|
||||
cdef Lexeme* lexeme
|
||||
cdef String span
|
||||
cdef UniStr span
|
||||
cdef int i
|
||||
if prefixes.size():
|
||||
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
|
||||
for i in range(prefixes.size()):
|
||||
idx = tokens.push_back(idx, prefixes[0][i])
|
||||
if string.n != 0:
|
||||
|
||||
lexemes = <Lexeme**>self._cache.get(string.key)
|
||||
if lexemes != NULL:
|
||||
idx = tokens.extend(idx, lexemes, 0)
|
||||
cache_hit = self._try_cache(idx, string.key, tokens)
|
||||
if cache_hit:
|
||||
idx = tokens.data[tokens.length - 1].idx + 1
|
||||
else:
|
||||
split = self._find_infix(string.chars, string.n)
|
||||
if split == 0 or split == -1:
|
||||
idx = tokens.push_back(idx, self.lexicon.get(string))
|
||||
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string))
|
||||
else:
|
||||
string_slice(&span, string.chars, 0, split)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
||||
string_slice(&span, string.chars, split, split+1)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
||||
string_slice(&span, string.chars, split + 1, string.n)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
||||
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
|
||||
slice_unicode(&span, string.chars, 0, split)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||
slice_unicode(&span, string.chars, split, split+1)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
|
||||
while it != suffixes.rend():
|
||||
idx = tokens.push_back(idx, deref(it))
|
||||
preinc(it)
|
||||
|
||||
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
|
||||
lexemes = <Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
|
||||
cdef int i
|
||||
for i in range(n):
|
||||
lexemes[i] = tokens[i]
|
||||
lexemes[i + 1] = NULL
|
||||
self._cache.set(key, lexemes)
|
||||
if tokens[i].lex.id == 1:
|
||||
return 0
|
||||
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||
cached.length = n
|
||||
cached.is_lex = True
|
||||
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
|
||||
for i in range(n):
|
||||
lexemes[i] = tokens[i].lex
|
||||
cached.data.lexemes = <const Lexeme* const*>lexemes
|
||||
self._cache.set(key, cached)
|
||||
|
||||
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||
cdef unicode string = chars[:length]
|
||||
|
@ -217,66 +237,120 @@ cdef class Language:
|
|||
match = self._suffix_re.search(string)
|
||||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def _load_special_tokenization(self, token_rules):
|
||||
'''Load special-case tokenization rules.
|
||||
|
||||
Loads special-case tokenization rules into the Language._cache cache,
|
||||
read from data/<lang>/tokenization . The special cases are loaded before
|
||||
any language data is tokenized, giving these priority. For instance,
|
||||
the English tokenization rules map "ain't" to ["are", "not"].
|
||||
|
||||
Args:
|
||||
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
|
||||
a string and tokens is a list of strings.
|
||||
def _load_special_tokenization(self, object rules):
|
||||
'''Add a special-case tokenization rule.
|
||||
'''
|
||||
cdef int i
|
||||
cdef unicode chunk
|
||||
cdef list substrings
|
||||
cdef unicode form
|
||||
cdef unicode lemma
|
||||
cdef dict props
|
||||
cdef Lexeme** lexemes
|
||||
cdef hash_t hashed
|
||||
cdef String string
|
||||
for uni_string, substrings in token_rules:
|
||||
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
|
||||
for i, substring in enumerate(substrings):
|
||||
string_from_unicode(&string, substring)
|
||||
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
|
||||
lexemes[i + 1] = NULL
|
||||
string_from_unicode(&string, uni_string)
|
||||
self._specials.set(string.key, lexemes)
|
||||
self._cache.set(string.key, lexemes)
|
||||
cdef UniStr string
|
||||
for chunk, substrings in sorted(rules.items()):
|
||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||
for i, props in enumerate(substrings):
|
||||
form = props['F']
|
||||
lemma = props.get("L", None)
|
||||
slice_unicode(&string, form, 0, len(form))
|
||||
tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
|
||||
if lemma:
|
||||
tokens[i].lemma = self.lexicon.strings[lemma]
|
||||
set_morph_from_dict(&tokens[i].morph, props)
|
||||
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||
cached.length = len(substrings)
|
||||
cached.is_lex = False
|
||||
cached.data.tokens = tokens
|
||||
slice_unicode(&string, chunk, 0, len(chunk))
|
||||
self._specials.set(string.key, cached)
|
||||
self._cache.set(string.key, cached)
|
||||
|
||||
|
||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||
morph.number = props.get('number', 0)
|
||||
morph.tenspect = props.get('tenspect', 0)
|
||||
morph.mood = props.get('mood', 0)
|
||||
morph.gender = props.get('gender', 0)
|
||||
morph.person = props.get('person', 0)
|
||||
morph.case = props.get('case', 0)
|
||||
morph.misc = props.get('misc', 0)
|
||||
|
||||
|
||||
cdef class Lexicon:
|
||||
def __init__(self):
|
||||
'''A map container for a language's Lexeme structs.
|
||||
|
||||
Also interns UTF-8 strings, and maps them to consecutive integer IDs.
|
||||
'''
|
||||
def __init__(self, object get_props):
|
||||
self.mem = Pool()
|
||||
self._dict = PreshMap(2 ** 20)
|
||||
self._map = PreshMap(2 ** 20)
|
||||
self.strings = StringStore()
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.size = 1
|
||||
self.get_lex_props = get_props
|
||||
|
||||
cdef Lexeme* get(self, String* string) except NULL:
|
||||
def __len__(self):
|
||||
return self.lexemes.size()
|
||||
|
||||
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
||||
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||
cdef Lexeme* lex
|
||||
lex = <Lexeme*>self._dict.get(string.key)
|
||||
lex = <Lexeme*>self._map.get(string.key)
|
||||
if lex != NULL:
|
||||
return lex
|
||||
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
||||
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
|
||||
self._dict.set(string.key, lex)
|
||||
while self.lexemes.size() < (lex.id + 1):
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.lexemes[lex.id] = lex
|
||||
self.size += 1
|
||||
if string.n < 3:
|
||||
mem = self.mem
|
||||
cdef unicode py_string = string.chars[:string.n]
|
||||
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
|
||||
lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
|
||||
self.get_lex_props(py_string))
|
||||
if mem is self.mem:
|
||||
self._map.set(string.key, lex)
|
||||
while self.lexemes.size() < (lex.id + 1):
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.lexemes[lex.id] = lex
|
||||
else:
|
||||
lex[0].id = 1
|
||||
return lex
|
||||
|
||||
def __getitem__(self, id_or_string):
|
||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new Lexeme is created and stored.
|
||||
|
||||
This function relies on Cython's struct-to-dict conversion. Python clients
|
||||
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
|
||||
with int values. Cython clients can instead receive a Lexeme struct value.
|
||||
More efficient Cython access is provided by Lexicon.get, which returns
|
||||
a Lexeme*.
|
||||
|
||||
Args:
|
||||
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||
string. If an int >= Lexicon.size, IndexError is raised.
|
||||
If id_or_string is neither an int nor a unicode string, ValueError
|
||||
is raised.
|
||||
|
||||
Returns:
|
||||
lexeme (dict): A Lexeme struct instance, which Cython translates into
|
||||
a dict if the operator is called from Python.
|
||||
'''
|
||||
if type(id_or_string) == int:
|
||||
if id_or_string >= self.lexemes.size():
|
||||
raise IndexError
|
||||
return self.lexemes.at(id_or_string)[0]
|
||||
cdef String string
|
||||
string_from_unicode(&string, id_or_string)
|
||||
cdef Lexeme* lexeme = self.get(&string)
|
||||
cdef UniStr string
|
||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||
cdef const Lexeme* lexeme = self.get(self.mem, &string)
|
||||
return lexeme[0]
|
||||
|
||||
def __setitem__(self, unicode uni_string, dict props):
|
||||
cdef String s
|
||||
string_from_unicode(&s, uni_string)
|
||||
cdef Lexeme* lex = self.get(&s)
|
||||
cdef UniStr s
|
||||
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||
# Cast through the const here, since we're allowed to change our own
|
||||
# Lexemes.
|
||||
lex = <Lexeme*><void*>self.get(self.mem, &s)
|
||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||
|
||||
def dump(self, loc):
|
||||
|
@ -287,11 +361,11 @@ cdef class Lexicon:
|
|||
assert fp != NULL
|
||||
cdef size_t st
|
||||
cdef hash_t key
|
||||
for i in range(self._dict.length):
|
||||
key = self._dict.c_map.cells[i].key
|
||||
for i in range(self._map.length):
|
||||
key = self._map.c_map.cells[i].key
|
||||
if key == 0:
|
||||
continue
|
||||
lexeme = <Lexeme*>self._dict.c_map.cells[i].value
|
||||
lexeme = <Lexeme*>self._map.c_map.cells[i].value
|
||||
st = fwrite(&key, sizeof(key), 1, fp)
|
||||
assert st == 1
|
||||
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
|
||||
|
@ -300,7 +374,8 @@ cdef class Lexicon:
|
|||
assert st == 0
|
||||
|
||||
def load(self, loc):
|
||||
assert path.exists(loc)
|
||||
if not path.exists(loc):
|
||||
raise IOError('Lexemes file not found at %s' % loc)
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
||||
assert fp != NULL
|
||||
|
@ -316,21 +391,9 @@ cdef class Lexicon:
|
|||
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
||||
if st != 1:
|
||||
break
|
||||
self._dict.set(key, lexeme)
|
||||
self._map.set(key, lexeme)
|
||||
while self.lexemes.size() < (lexeme.id + 1):
|
||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||
self.lexemes[lexeme.id] = lexeme
|
||||
i += 1
|
||||
self.size += 1
|
||||
fclose(fp)
|
||||
|
||||
|
||||
cdef void string_from_unicode(String* s, unicode uni):
|
||||
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
|
||||
string_slice(s, c_uni, 0, len(uni))
|
||||
|
||||
|
||||
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||
s.chars = &chars[start]
|
||||
s.n = end - start
|
||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||
|
|
spacy/lemmatizer.py (new file, 90 lines)
@@ -0,0 +1,90 @@
from os import path


NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
)


VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
)


ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
)


class Lemmatizer(object):
    def __init__(self, wn_dict_dir):
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))

    def noun(self, string):
        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)

    def verb(self, string):
        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)

    def adj(self, string):
        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)


def read_index(loc):
    index = set()
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:
            index.add(word)
    return index


def read_exc(loc):
    exceptions = {}
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
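
A usage sketch (editor's addition, not part of this commit) of how the exception table and suffix rules combine in lemmatize(); the toy index and exceptions below are made-up stand-ins for the WordNet files that read_index and read_exc load:

from spacy.lemmatizer import lemmatize, NOUN_RULES

# Hypothetical data standing in for index.noun / noun.exc
toy_index = set(['aardwolf', 'planet', 'axe', 'ax'])
toy_exc = {'aardwolves': ('aardwolf',), 'axes': ('axis',)}

# Exceptions are taken as-is; a suffix rule only contributes a form found in the index.
assert lemmatize('aardwolves', toy_index, toy_exc, NOUN_RULES) == set(['aardwolf'])
assert lemmatize('planets', toy_index, toy_exc, NOUN_RULES) == set(['planet'])
assert lemmatize('axes', toy_index, toy_exc, NOUN_RULES) == set(['axis', 'axe', 'ax'])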
|
spacy/lexeme.pxd (160 lines)
|
@ -1,61 +1,137 @@
|
|||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
|
||||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
|
||||
|
||||
from .utf8string cimport StringStore
|
||||
from libc.stdint cimport uint16_t
|
||||
|
||||
cpdef flag_t OOV_DIST_FLAGS
|
||||
|
||||
# Flags
|
||||
cpdef enum:
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
IS_DIGIT
|
||||
IS_LOWER
|
||||
IS_PUNCT
|
||||
IS_SPACE
|
||||
IS_TITLE
|
||||
IS_UPPER
|
||||
# Reserve 64 values for flag features
|
||||
cpdef enum attr_id_t:
|
||||
FLAG0
|
||||
FLAG1
|
||||
FLAG2
|
||||
FLAG3
|
||||
FLAG4
|
||||
FLAG5
|
||||
FLAG6
|
||||
FLAG7
|
||||
FLAG8
|
||||
FLAG9
|
||||
FLAG10
|
||||
FLAG11
|
||||
FLAG12
|
||||
FLAG13
|
||||
FLAG14
|
||||
FLAG15
|
||||
FLAG16
|
||||
FLAG17
|
||||
FLAG18
|
||||
FLAG19
|
||||
FLAG20
|
||||
FLAG21
|
||||
FLAG22
|
||||
FLAG23
|
||||
FLAG24
|
||||
FLAG25
|
||||
FLAG26
|
||||
FLAG27
|
||||
FLAG28
|
||||
FLAG29
|
||||
FLAG30
|
||||
FLAG31
|
||||
FLAG32
|
||||
FLAG33
|
||||
FLAG34
|
||||
FLAG35
|
||||
FLAG36
|
||||
FLAG37
|
||||
FLAG38
|
||||
FLAG39
|
||||
FLAG40
|
||||
FLAG41
|
||||
FLAG42
|
||||
FLAG43
|
||||
FLAG44
|
||||
FLAG45
|
||||
FLAG46
|
||||
FLAG47
|
||||
FLAG48
|
||||
FLAG49
|
||||
FLAG50
|
||||
FLAG51
|
||||
FLAG52
|
||||
FLAG53
|
||||
FLAG54
|
||||
FLAG55
|
||||
FLAG56
|
||||
FLAG57
|
||||
FLAG58
|
||||
FLAG59
|
||||
FLAG60
|
||||
FLAG61
|
||||
FLAG62
|
||||
FLAG63
|
||||
|
||||
LIKE_URL
|
||||
LIKE_NUMBER
|
||||
ID
|
||||
SIC
|
||||
DENSE
|
||||
SHAPE
|
||||
PREFIX
|
||||
SUFFIX
|
||||
|
||||
OFT_LOWER
|
||||
OFT_TITLE
|
||||
OFT_UPPER
|
||||
|
||||
IN_MALES
|
||||
IN_FEMALES
|
||||
IN_SURNAMES
|
||||
IN_PLACES
|
||||
IN_GAMES
|
||||
IN_CELEBS
|
||||
IN_NAMES
|
||||
LENGTH
|
||||
CLUSTER
|
||||
POS_TYPE
|
||||
LEMMA
|
||||
|
||||
|
||||
cdef struct Lexeme:
|
||||
flag_t flags
|
||||
flags_t flags
|
||||
|
||||
id_t id
|
||||
id_t sic
|
||||
id_t norm
|
||||
id_t shape
|
||||
id_t asciied
|
||||
id_t prefix
|
||||
id_t suffix
|
||||
attr_t id
|
||||
attr_t sic
|
||||
attr_t dense
|
||||
attr_t shape
|
||||
attr_t prefix
|
||||
attr_t suffix
|
||||
|
||||
attr_t length
|
||||
attr_t cluster
|
||||
attr_t pos_type
|
||||
|
||||
float prob
|
||||
|
||||
len_t length
|
||||
tag_t cluster
|
||||
tag_t postype
|
||||
tag_t supersense
|
||||
float sentiment
|
||||
|
||||
|
||||
cdef Lexeme EMPTY_LEXEME
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||
StringStore store, dict props) except *
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||
dict props) except *
|
||||
|
||||
|
||||
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
|
||||
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
|
||||
return lexeme.flags & (1 << flag_id)
|
||||
|
||||
|
||||
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
|
||||
if feat_name < (sizeof(flags_t) * 8):
|
||||
return check_flag(lex, feat_name)
|
||||
elif feat_name == ID:
|
||||
return lex.id
|
||||
elif feat_name == SIC:
|
||||
return lex.sic
|
||||
elif feat_name == DENSE:
|
||||
return lex.dense
|
||||
elif feat_name == SHAPE:
|
||||
return lex.shape
|
||||
elif feat_name == PREFIX:
|
||||
return lex.prefix
|
||||
elif feat_name == SUFFIX:
|
||||
return lex.suffix
|
||||
elif feat_name == LENGTH:
|
||||
return lex.length
|
||||
elif feat_name == CLUSTER:
|
||||
return lex.cluster
|
||||
elif feat_name == POS_TYPE:
|
||||
return lex.pos_type
|
||||
else:
|
||||
return 0
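
Attribute IDs below 64 double as bit positions in the lexeme's 64-bit flags field, which is why get_attr falls back to check_flag for feat_name < sizeof(flags_t) * 8. A plain-Python illustration of that bit test (editor's sketch, not part of this commit; the slot number is arbitrary):

FLAG_IS_ALPHA = 3                 # hypothetical slot; the real slots are FLAG0..FLAG63
flags = 0
flags |= 1 << FLAG_IS_ALPHA       # set the bit when the property holds

def check_flag_py(flags, flag_id):
    # mirrors `lexeme.flags & (1 << flag_id)` above
    return bool(flags & (1 << flag_id))

assert check_flag_py(flags, FLAG_IS_ALPHA)
assert not check_flag_py(flags, 5)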
|
||||
|
|
|
@ -6,67 +6,25 @@ from libc.string cimport memset
|
|||
|
||||
import orth
|
||||
|
||||
from .utf8string cimport Utf8Str
|
||||
|
||||
OOV_DIST_FLAGS = 0
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
||||
|
||||
|
||||
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
|
||||
cdef flag_t flags = 0
|
||||
flags |= orth.is_alpha(string) << IS_ALPHA
|
||||
flags |= orth.is_ascii(string) << IS_ASCII
|
||||
flags |= orth.is_digit(string) << IS_DIGIT
|
||||
flags |= orth.is_lower(string) << IS_LOWER
|
||||
flags |= orth.is_punct(string) << IS_PUNCT
|
||||
flags |= orth.is_space(string) << IS_SPACE
|
||||
flags |= orth.is_title(string) << IS_TITLE
|
||||
flags |= orth.is_upper(string) << IS_UPPER
|
||||
|
||||
flags |= orth.like_url(string) << LIKE_URL
|
||||
flags |= orth.like_number(string) << LIKE_NUMBER
|
||||
return flags
|
||||
|
||||
|
||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||
StringStore store, dict props) except *:
|
||||
StringStore string_store, dict props) except *:
|
||||
cdef Lexeme lex
|
||||
lex.id = i
|
||||
lex.length = len(string)
|
||||
lex.sic = get_string_id(string, store)
|
||||
lex.sic = string_store[string]
|
||||
|
||||
lex.cluster = props.get('cluster', 0)
|
||||
lex.postype = props.get('postype', 0)
|
||||
lex.supersense = props.get('supersense', 0)
|
||||
lex.pos_type = props.get('pos_type', 0)
|
||||
lex.prob = props.get('prob', 0)
|
||||
|
||||
cdef float upper_pc = props.get('upper_pc', 0.0)
|
||||
cdef float lower_pc = props.get('lower_pc', 0.0)
|
||||
cdef float title_pc = props.get('title_pc', 0.0)
|
||||
|
||||
lex.prefix = get_string_id(string[0], store)
|
||||
lex.suffix = get_string_id(string[-3:], store)
|
||||
if upper_pc or lower_pc or title_pc:
|
||||
canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
|
||||
lex.norm = get_string_id(canon_cased, store)
|
||||
else:
|
||||
lex.norm = lex.sic
|
||||
lex.shape = get_string_id(orth.word_shape(string), store)
|
||||
lex.asciied = get_string_id(orth.asciied(string), store)
|
||||
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
|
||||
|
||||
lex.flags |= props.get('in_males', 0) << IN_MALES
|
||||
lex.flags |= props.get('in_females', 0) << IN_FEMALES
|
||||
lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
|
||||
lex.flags |= props.get('in_places', 0) << IN_PLACES
|
||||
lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
|
||||
lex.flags |= props.get('in_games', 0) << IN_GAMES
|
||||
lex.flags |= props.get('in_names', 0) << IN_NAMES
|
||||
lex.prefix = string_store[string[:1]]
|
||||
lex.suffix = string_store[string[-3:]]
|
||||
lex.shape = string_store[orth.word_shape(string)]
|
||||
lex.dense = string_store[props['dense']]
|
||||
|
||||
lex.flags = props.get('flags', 0)
|
||||
return lex
|
||||
|
||||
|
||||
cdef id_t get_string_id(unicode string, StringStore store) except 0:
|
||||
cdef bytes byte_string = string.encode('utf8')
|
||||
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
|
||||
return orig_str.i
|
||||
|
|
spacy/morphology.pxd (new file, 45 lines)
|
@@ -0,0 +1,45 @@

from .tokens cimport TokenC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
from .typedefs cimport id_t, Morphology

from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool


# Google universal tag set
cpdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS


cdef struct PosTag:
    Morphology morph
    int id
    univ_tag_t pos


cdef class Morphologizer:
    cdef Pool mem
    cdef StringStore strings
    cdef object lemmatizer
    cdef PosTag* tags
    cdef readonly list tag_names

    cdef PreshMapArray _cache
    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
    cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
spacy/morphology.pyx (new file, 117 lines)
|
@ -0,0 +1,117 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
from os import path
|
||||
import json
|
||||
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .typedefs cimport id_t
|
||||
|
||||
UNIV_TAGS = {
|
||||
'NULL': NO_TAG,
|
||||
'ADJ': ADJ,
|
||||
'ADV': ADV,
|
||||
'ADP': ADP,
|
||||
'CONJ': CONJ,
|
||||
'DET': DET,
|
||||
'NOUN': NOUN,
|
||||
'NUM': NUM,
|
||||
'PRON': PRON,
|
||||
'PRT': PRT,
|
||||
'VERB': VERB,
|
||||
'X': X,
|
||||
'.': PUNCT,
|
||||
'EOL': EOL
|
||||
}
|
||||
|
||||
|
||||
cdef struct _Cached:
|
||||
Morphology morph
|
||||
int lemma
|
||||
|
||||
|
||||
cdef class Morphologizer:
|
||||
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
||||
"""
|
||||
def __init__(self, StringStore strings, data_dir):
|
||||
self.mem = Pool()
|
||||
self.strings = strings
|
||||
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
|
||||
tag_map = cfg['tag_map']
|
||||
self.tag_names = cfg['tag_names']
|
||||
self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
|
||||
self._cache = PreshMapArray(len(self.tag_names))
|
||||
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
|
||||
for i, tag in enumerate(self.tag_names):
|
||||
pos, props = tag_map[tag]
|
||||
self.tags[i].id = i
|
||||
self.tags[i].pos = pos
|
||||
self.tags[i].morph.number = props.get('number', 0)
|
||||
self.tags[i].morph.tenspect = props.get('tenspect', 0)
|
||||
self.tags[i].morph.mood = props.get('mood', 0)
|
||||
self.tags[i].morph.gender = props.get('gender', 0)
|
||||
self.tags[i].morph.person = props.get('person', 0)
|
||||
self.tags[i].morph.case = props.get('case', 0)
|
||||
self.tags[i].morph.misc = props.get('misc', 0)
|
||||
if path.exists(path.join(data_dir, 'morphs.json')):
|
||||
with open(path.join(data_dir, 'morphs.json')) as file_:
|
||||
self.load_exceptions(json.load(file_))
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
return lex.sic
|
||||
if pos != NOUN and pos != VERB and pos != ADJ:
|
||||
return lex.sic
|
||||
cdef bytes py_string = self.strings[lex.sic]
|
||||
cdef set lemma_strings
|
||||
cdef bytes lemma_string
|
||||
if pos == NOUN:
|
||||
lemma_strings = self.lemmatizer.noun(py_string)
|
||||
elif pos == VERB:
|
||||
lemma_strings = self.lemmatizer.verb(py_string)
|
||||
else:
|
||||
assert pos == ADJ
|
||||
lemma_strings = self.lemmatizer.adj(py_string)
|
||||
lemma_string = sorted(lemma_strings)[0]
|
||||
lemma = self.strings.intern(lemma_string, len(lemma_string)).i
|
||||
return lemma
|
||||
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
|
||||
cdef const PosTag* tag = &self.tags[tokens[i].pos]
|
||||
cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
|
||||
if cached is NULL:
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
||||
cached.morph = tag.morph
|
||||
self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
||||
|
||||
tokens[i].lemma = cached.lemma
|
||||
tokens[i].morph = cached.morph
|
||||
|
||||
def load_exceptions(self, dict exc):
|
||||
cdef unicode pos_str
|
||||
cdef unicode form_str
|
||||
cdef unicode lemma_str
|
||||
cdef dict entries
|
||||
cdef dict props
|
||||
cdef int lemma
|
||||
cdef id_t sic
|
||||
cdef univ_tag_t pos
|
||||
for pos_str, entries in exc.items():
|
||||
pos = self.tag_names.index(pos_str)
|
||||
for form_str, props in entries.items():
|
||||
lemma_str = props.get('L', form_str)
|
||||
sic = self.strings[form_str]
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.lemma = self.strings[lemma_str]
|
||||
set_morph_from_dict(&cached.morph, props)
|
||||
self._cache.set(pos, sic, <void*>cached)
|
||||
|
||||
|
||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||
morph.number = props.get('number', 0)
|
||||
morph.tenspect = props.get('tenspect', 0)
|
||||
morph.mood = props.get('mood', 0)
|
||||
morph.gender = props.get('gender', 0)
|
||||
morph.person = props.get('person', 0)
|
||||
morph.case = props.get('case', 0)
|
||||
morph.misc = props.get('misc', 0)
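
For reference, load_exceptions() expects morphs.json to map a tag name to surface forms, each carrying an optional 'L' lemma plus any of the Morphology fields that set_morph_from_dict reads. A hypothetical entry (editor's sketch, not taken from the data files; the tag name and values are assumptions):

exc = {
    'VBZ': {                                          # assumed entry in tag_names
        'is': {'L': 'be', 'number': 1, 'person': 3},  # lemma plus Morphology fields
    },
}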
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
import unicodedata
|
||||
from unidecode import unidecode
|
||||
import re
|
||||
|
||||
import math
|
||||
|
||||
|
|
|
@ -147,6 +147,7 @@ Y PRT
|
|||
Z NOUN
|
||||
^ NOUN
|
||||
~ X
|
||||
`` .""".strip().split('\n'))
|
||||
`` .
|
||||
EOL EOL""".strip().split('\n'))
|
||||
return mapping[tag]
|
||||
|
||||
|
|
|
@ -1,34 +1,23 @@
|
|||
from libc.stdint cimport uint8_t
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from thinc.learner cimport LinearModel
|
||||
from thinc.features cimport Extractor
|
||||
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from .context cimport Slots
|
||||
from preshed.maps cimport PreshMapArray
|
||||
|
||||
from .typedefs cimport hash_t, id_t
|
||||
from .tokens cimport Tokens
|
||||
|
||||
|
||||
cpdef enum TagType:
|
||||
POS
|
||||
ENTITY
|
||||
SENSE
|
||||
|
||||
|
||||
cdef class Tagger:
|
||||
cpdef int set_tags(self, Tokens tokens) except -1
|
||||
cpdef class_t predict(self, int i, Tokens tokens) except 0
|
||||
cpdef int tell_answer(self, list gold) except -1
|
||||
cdef class_t predict(self, const atom_t* context, object golds=*) except *
|
||||
|
||||
cpdef readonly Pool mem
|
||||
cpdef readonly Extractor extractor
|
||||
cpdef readonly LinearModel model
|
||||
|
||||
cpdef readonly TagType tag_type
|
||||
cpdef readonly list tag_names
|
||||
|
||||
cdef class_t _guess
|
||||
cdef atom_t* _context
|
||||
cdef feat_t* _feats
|
||||
cdef weight_t* _values
|
||||
cdef weight_t* _scores
|
||||
cdef dict tagdict
|
||||
|
|
spacy/tagger.pyx (185 lines)
|
@ -1,5 +1,4 @@
|
|||
# cython: profile=True
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
|
||||
|
@ -10,155 +9,59 @@ import random
|
|||
import json
|
||||
import cython
|
||||
|
||||
|
||||
from .context cimport fill_context
|
||||
from .context cimport N_FIELDS
|
||||
|
||||
from thinc.features cimport ConjFeat
|
||||
from thinc.features cimport Feature, count_feats
|
||||
|
||||
|
||||
NULL_TAG = 0
|
||||
|
||||
|
||||
def setup_model_dir(tag_type, tag_names, templates, model_dir):
|
||||
def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
|
||||
if path.exists(model_dir):
|
||||
shutil.rmtree(model_dir)
|
||||
os.mkdir(model_dir)
|
||||
config = {
|
||||
'tag_type': tag_type,
|
||||
'templates': templates,
|
||||
'tag_names': tag_names,
|
||||
'tag_map': tag_map,
|
||||
'tag_counts': tag_counts,
|
||||
}
|
||||
with open(path.join(model_dir, 'config.json'), 'w') as file_:
|
||||
json.dump(config, file_)
|
||||
|
||||
|
||||
def train(train_sents, model_dir, nr_iter=10):
|
||||
cdef Tokens tokens
|
||||
tagger = Tagger(model_dir)
|
||||
for _ in range(nr_iter):
|
||||
n_corr = 0
|
||||
total = 0
|
||||
for tokens, golds in train_sents:
|
||||
assert len(tokens) == len(golds), [t.string for t in tokens]
|
||||
for i in range(tokens.length):
|
||||
if tagger.tag_type == POS:
|
||||
gold = _get_gold_pos(i, golds, tokens.pos)
|
||||
elif tagger.tag_type == ENTITY:
|
||||
gold = _get_gold_ner(i, golds, tokens.ner)
|
||||
guess = tagger.predict(i, tokens)
|
||||
tokens.set_tag(i, tagger.tag_type, guess)
|
||||
if gold is not None:
|
||||
tagger.tell_answer(gold)
|
||||
total += 1
|
||||
n_corr += guess in gold
|
||||
#print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
|
||||
print('%.4f' % ((n_corr / total) * 100))
|
||||
random.shuffle(train_sents)
|
||||
tagger.model.end_training()
|
||||
tagger.model.dump(path.join(model_dir, 'model'))
|
||||
|
||||
|
||||
cdef object _get_gold_pos(i, golds, int* pred):
|
||||
if golds[i] == 0:
|
||||
return None
|
||||
else:
|
||||
return [golds[i]]
|
||||
|
||||
|
||||
cdef object _get_gold_ner(i, golds, int* ner):
|
||||
if golds[i] == 0:
|
||||
return None
|
||||
else:
|
||||
return [golds[i]]
|
||||
|
||||
|
||||
def evaluate(tagger, sents):
|
||||
n_corr = 0
|
||||
total = 0
|
||||
for tokens, golds in sents:
|
||||
for i, gold in enumerate(golds):
|
||||
guess = tagger.predict(i, tokens)
|
||||
tokens.set_tag(i, tagger.tag_type, guess)
|
||||
if gold != NULL_TAG:
|
||||
total += 1
|
||||
n_corr += guess == gold
|
||||
return n_corr / total
|
||||
|
||||
|
||||
cdef class Tagger:
|
||||
"""Assign part-of-speech, named entity or supersense tags, using greedy
|
||||
decoding. The tagger reads its model and configuration from disk.
|
||||
"""Predict some type of tag, using greedy decoding. The tagger reads its
|
||||
model and configuration from disk.
|
||||
"""
|
||||
def __init__(self, model_dir):
|
||||
self.mem = Pool()
|
||||
cfg = json.load(open(path.join(model_dir, 'config.json')))
|
||||
templates = cfg['templates']
|
||||
univ_counts = {}
|
||||
cdef unicode tag
|
||||
cdef unicode univ_tag
|
||||
self.tag_names = cfg['tag_names']
|
||||
self.tag_type = cfg['tag_type']
|
||||
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
|
||||
self.model = LinearModel(len(self.tag_names))
|
||||
self.tagdict = _make_tag_dict(cfg['tag_counts'])
|
||||
self.extractor = Extractor(templates)
|
||||
self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
|
||||
if path.exists(path.join(model_dir, 'model')):
|
||||
self.model.load(path.join(model_dir, 'model'))
|
||||
|
||||
self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
|
||||
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
|
||||
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
|
||||
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
|
||||
self._guess = NULL_TAG
|
||||
|
||||
cpdef int set_tags(self, Tokens tokens) except -1:
|
||||
"""Assign tags to a Tokens object.
|
||||
|
||||
>>> tokens = EN.tokenize(u'An example sentence.')
|
||||
>>> assert tokens[0].pos == 'NO_TAG'
|
||||
>>> EN.pos_tagger.set_tags(tokens)
|
||||
>>> assert tokens[0].pos == 'DT'
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(tokens.length):
|
||||
tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
|
||||
|
||||
cpdef class_t predict(self, int i, Tokens tokens) except 0:
|
||||
"""Predict the tag of tokens[i]. The tagger remembers the features and
|
||||
prediction, in case you later call tell_answer.
|
||||
cdef class_t predict(self, atom_t* context, object golds=None) except *:
|
||||
"""Predict the tag of tokens[i].
|
||||
|
||||
>>> tokens = EN.tokenize(u'An example sentence.')
|
||||
>>> tag = EN.pos_tagger.predict(0, tokens)
|
||||
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
|
||||
"""
|
||||
fill_context(self._context, i, tokens)
|
||||
self.extractor.extract(self._feats, self._values, self._context, NULL)
|
||||
self._guess = self.model.score(self._scores, self._feats, self._values)
|
||||
return self._guess
|
||||
|
||||
cpdef int tell_answer(self, list golds) except -1:
|
||||
"""Provide the correct tag for the word the tagger was last asked to predict.
|
||||
During Tagger.predict, the tagger remembers the features and prediction
|
||||
for the example. These are used to calculate a weight update given the
|
||||
correct label.
|
||||
|
||||
>>> tokens = EN.tokenize('An example sentence.')
|
||||
>>> guess = EN.pos_tagger.predict(1, tokens)
|
||||
>>> JJ = EN.pos_tagger.tag_id('JJ')
|
||||
>>> JJ
|
||||
7
|
||||
>>> EN.pos_tagger.tell_answer(JJ)
|
||||
"""
|
||||
cdef class_t guess = self._guess
|
||||
if guess in golds:
|
||||
self.model.update({})
|
||||
return 0
|
||||
best_gold = golds[0]
|
||||
best_score = self._scores[best_gold-1]
|
||||
for gold in golds[1:]:
|
||||
if self._scores[gold-1] > best_gold:
|
||||
best_score = self._scores[best_gold-1]
|
||||
best_gold = gold
|
||||
counts = {guess: {}, best_gold: {}}
|
||||
self.extractor.count(counts[best_gold], self._feats, 1)
|
||||
self.extractor.count(counts[guess], self._feats, -1)
|
||||
self.model.update(counts)
|
||||
cdef int n_feats
|
||||
cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
|
||||
cdef weight_t* scores = self.model.get_scores(feats, n_feats)
|
||||
guess = _arg_max(scores, self.model.nr_class)
|
||||
if golds is not None and guess not in golds:
|
||||
best = _arg_max_among(scores, golds)
|
||||
counts = {guess: {}, best: {}}
|
||||
count_feats(counts[guess], feats, n_feats, -1)
|
||||
count_feats(counts[best], feats, n_feats, 1)
|
||||
self.model.update(counts)
|
||||
return guess
|
||||
|
||||
def tag_id(self, object tag_name):
|
||||
"""Encode tag_name into a tag ID integer."""
|
||||
|
@ -167,3 +70,41 @@ cdef class Tagger:
|
|||
tag_id = len(self.tag_names)
|
||||
self.tag_names.append(tag_name)
|
||||
return tag_id
|
||||
|
||||
|
||||
def _make_tag_dict(counts):
|
||||
freq_thresh = 20
|
||||
ambiguity_thresh = 0.97
|
||||
tagdict = {}
|
||||
cdef atom_t word
|
||||
cdef atom_t tag
|
||||
for word_str, tag_freqs in counts.items():
|
||||
tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1])
|
||||
n = sum(tag_freqs.values())
|
||||
word = int(word_str)
|
||||
tag = int(tag_str)
|
||||
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
|
||||
tagdict[word] = tag
|
||||
return tagdict
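
A quick worked example of the thresholds (editor's note, not in the commit): a word counted 100 times, 98 of them under a single tag, passes both tests, so it gets a tag-dictionary entry.

n, mode = 100, 98                           # hypothetical counts for one word
freq_thresh, ambiguity_thresh = 20, 0.97
assert n >= freq_thresh and float(mode) / n >= ambiguity_thresh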
|
||||
|
||||
|
||||
cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000:
|
||||
cdef int best = 0
|
||||
cdef weight_t score = scores[best]
|
||||
cdef int i
|
||||
for i in range(1, n_classes):
|
||||
if scores[i] >= score:
|
||||
score = scores[i]
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
cdef class_t _arg_max_among(weight_t* scores, list classes):
|
||||
cdef int best = classes[0]
|
||||
cdef weight_t score = scores[best]
|
||||
cdef class_t clas
|
||||
for clas in classes:
|
||||
if scores[clas] > score:
|
||||
score = scores[clas]
|
||||
best = clas
|
||||
return best
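
The new predict() folds the old tell_answer() step into one call: extract features, take the argmax as the guess, and, when a gold list is supplied and the guess is wrong, credit the best-scoring gold class and debit the guess. A plain-Python sketch of that error-driven update (editor's addition; a dict of per-(class, feature) weights stands in for thinc's LinearModel):

def update_on_error(weights, feats, scores, guess, golds):
    best = max(golds, key=lambda c: scores[c])    # what _arg_max_among computes
    if guess not in golds:
        for f in feats:
            weights[(best, f)] = weights.get((best, f), 0.0) + 1.0    # count_feats(counts[best], ..., 1)
            weights[(guess, f)] = weights.get((guess, f), 0.0) - 1.0  # count_feats(counts[guess], ..., -1)
    return guess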
|
||||
|
|
|
@ -1,40 +1,55 @@
|
|||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
from .lexeme cimport Lexeme
|
||||
from .typedefs cimport flag_t
|
||||
from .utf8string cimport StringStore
|
||||
from .tagger cimport TagType
|
||||
|
||||
from thinc.typedefs cimport atom_t
|
||||
from .typedefs cimport flags_t
|
||||
from .typedefs cimport Morphology
|
||||
from .lang cimport Language
|
||||
|
||||
|
||||
|
||||
cdef struct TokenC:
|
||||
const Lexeme* lex
|
||||
Morphology morph
|
||||
int idx
|
||||
int pos
|
||||
int lemma
|
||||
int sense
|
||||
|
||||
|
||||
ctypedef const Lexeme* const_Lexeme_ptr
|
||||
ctypedef TokenC* TokenC_ptr
|
||||
|
||||
ctypedef fused LexemeOrToken:
|
||||
const_Lexeme_ptr
|
||||
TokenC_ptr
|
||||
|
||||
|
||||
cdef class Tokens:
|
||||
cdef Pool mem
|
||||
cdef StringStore _string_store
|
||||
cdef Language lang
|
||||
cdef list tag_names
|
||||
|
||||
cdef Lexeme** _lex_ptr
|
||||
cdef int* _idx_ptr
|
||||
cdef int* _pos_ptr
|
||||
cdef int* _ner_ptr
|
||||
cdef Lexeme** lex
|
||||
cdef int* idx
|
||||
cdef int* pos
|
||||
cdef int* ner
|
||||
cdef TokenC* data
|
||||
|
||||
cdef int length
|
||||
cdef int max_length
|
||||
|
||||
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
|
||||
cdef int push_back(self, int i, Lexeme* lexeme) except -1
|
||||
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
|
||||
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
||||
|
||||
cpdef np.ndarray[long, ndim=2] get_array(self, list features)
|
||||
|
||||
|
||||
cdef class Token:
|
||||
cdef StringStore _string_store
|
||||
cdef public Language lang
|
||||
cdef public int i
|
||||
cdef public int idx
|
||||
cdef public int pos
|
||||
cdef public int ner
|
||||
cdef int pos
|
||||
cdef int lemma
|
||||
|
||||
cdef public atom_t id
|
||||
cdef public atom_t cluster
|
||||
|
@ -51,4 +66,4 @@ cdef class Token:
|
|||
|
||||
cdef public float prob
|
||||
|
||||
cdef public flag_t flags
|
||||
cdef public flags_t flags
|
||||
|
|
spacy/tokens.pyx (151 lines)
|
@ -1,7 +1,15 @@
|
|||
# cython: profile=True
|
||||
from preshed.maps cimport PreshMap
|
||||
from preshed.counter cimport PreshCounter
|
||||
|
||||
from .lexeme cimport *
|
||||
cimport cython
|
||||
from .tagger cimport POS, ENTITY
|
||||
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
POS = 0
|
||||
ENTITY = 0
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
@ -17,23 +25,13 @@ cdef class Tokens:
|
|||
"""A sequence of references to Lexeme objects.
|
||||
|
||||
The Tokens class provides fast and memory-efficient access to lexical features,
|
||||
and can efficiently export the data to a numpy array. Specific languages
|
||||
create their own Tokens subclasses, to provide more convenient access to
|
||||
language-specific features.
|
||||
and can efficiently export the data to a numpy array.
|
||||
|
||||
>>> from spacy.en import EN
|
||||
>>> tokens = EN.tokenize('An example sentence.')
|
||||
>>> tokens.string(0)
|
||||
'An'
|
||||
>>> tokens.prob(0) > tokens.prob(1)
|
||||
True
|
||||
>>> tokens.can_noun(0)
|
||||
False
|
||||
>>> tokens.can_noun(1)
|
||||
True
|
||||
"""
|
||||
def __init__(self, StringStore string_store, string_length=0):
|
||||
self._string_store = string_store
|
||||
def __init__(self, Language lang, string_length=0):
|
||||
self.lang = lang
|
||||
if string_length >= 3:
|
||||
size = int(string_length / 3.0)
|
||||
else:
|
||||
|
@ -42,28 +40,18 @@ cdef class Tokens:
|
|||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
||||
# However, we need to remember the true starting places, so that we can
|
||||
# realloc.
|
||||
self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
|
||||
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
||||
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
||||
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
||||
self.lex = self._lex_ptr
|
||||
self.idx = self._idx_ptr
|
||||
self.pos = self._pos_ptr
|
||||
self.ner = self._ner_ptr
|
||||
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||
cdef int i
|
||||
for i in range(size + (PADDING*2)):
|
||||
self.lex[i] = &EMPTY_LEXEME
|
||||
self.lex += PADDING
|
||||
self.idx += PADDING
|
||||
self.pos += PADDING
|
||||
self.ner += PADDING
|
||||
data_start[i].lex = &EMPTY_LEXEME
|
||||
self.data = data_start + PADDING
|
||||
self.max_length = size
|
||||
self.length = 0
|
||||
|
||||
def __getitem__(self, i):
|
||||
bounds_check(i, self.length, PADDING)
|
||||
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
|
||||
self.lex[i][0])
|
||||
return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
|
||||
self.data[i].lemma, self.data[i].lex[0])
|
||||
|
||||
def __iter__(self):
|
||||
for i in range(self.length):
|
||||
|
@ -72,70 +60,78 @@ cdef class Tokens:
|
|||
def __len__(self):
|
||||
return self.length
|
||||
|
||||
cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
|
||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||
if self.length == self.max_length:
|
||||
self._realloc(self.length * 2)
|
||||
self.lex[self.length] = lexeme
|
||||
self.idx[self.length] = idx
|
||||
self.pos[self.length] = 0
|
||||
self.ner[self.length] = 0
|
||||
self.length += 1
|
||||
return idx + lexeme.length
|
||||
|
||||
cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
|
||||
cdef int i
|
||||
if lexemes == NULL:
|
||||
return idx
|
||||
elif n == 0:
|
||||
i = 0
|
||||
while lexemes[i] != NULL:
|
||||
idx = self.push_back(idx, lexemes[i])
|
||||
i += 1
|
||||
cdef TokenC* t = &self.data[self.length]
|
||||
if LexemeOrToken is TokenC_ptr:
|
||||
t[0] = lex_or_tok[0]
|
||||
else:
|
||||
for i in range(n):
|
||||
idx = self.push_back(idx, lexemes[i])
|
||||
return idx
|
||||
t.lex = lex_or_tok
|
||||
self.length += 1
|
||||
return idx + t.lex.length
|
||||
|
||||
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
|
||||
if tag_type == POS:
|
||||
self.pos[i] = tag
|
||||
elif tag_type == ENTITY:
|
||||
self.ner[i] = tag
|
||||
@cython.boundscheck(False)
|
||||
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
|
||||
cdef int i, j
|
||||
cdef attr_id_t feature
|
||||
cdef np.ndarray[long, ndim=2] output
|
||||
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
||||
for i in range(self.length):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i, j] = get_attr(self.data[i].lex, feature)
|
||||
return output
|
||||
|
||||
def count_by(self, attr_id_t attr_id):
|
||||
cdef int i
|
||||
cdef attr_t attr
|
||||
cdef size_t count
|
||||
|
||||
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
||||
for i in range(self.length):
|
||||
if attr_id == LEMMA:
|
||||
attr = self.data[i].lemma
|
||||
else:
|
||||
attr = get_attr(self.data[i].lex, attr_id)
|
||||
counts.inc(attr, 1)
|
||||
return dict(counts)
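
A hypothetical usage sketch of the two new export helpers (editor's addition; it assumes SIC and LEMMA are importable from spacy.lexeme, as the attr_id_t enum suggests):

from spacy.en import EN
from spacy.lexeme import SIC, LEMMA

EN.load()
tokens = EN.tokenize(u'dogs chase dogs')
arr = tokens.get_array([SIC, LEMMA])    # (n_tokens, 2) numpy array of attribute ids
freqs = tokens.count_by(SIC)            # e.g. the id for "dogs" mapped to a count of 2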
|
||||
|
||||
def _realloc(self, new_size):
|
||||
self.max_length = new_size
|
||||
n = new_size + (PADDING * 2)
|
||||
self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
|
||||
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
|
||||
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
|
||||
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
|
||||
self.lex = self._lex_ptr + PADDING
|
||||
self.idx = self._idx_ptr + PADDING
|
||||
self.pos = self._pos_ptr + PADDING
|
||||
self.ner = self._ner_ptr + PADDING
|
||||
# What we're storing is a "padded" array. We've jumped forward PADDING
|
||||
# places, and are storing the pointer to that. This way, we can access
|
||||
# words out-of-bounds, and get out-of-bounds markers.
|
||||
# Now that we want to realloc, we need the address of the true start,
|
||||
# so we jump the pointer back PADDING places.
|
||||
cdef TokenC* data_start = self.data - PADDING
|
||||
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
|
||||
self.data = data_start + PADDING
|
||||
cdef int i
|
||||
for i in range(self.length, self.max_length + PADDING):
|
||||
self.lex[i] = &EMPTY_LEXEME
|
||||
self.data[i].lex = &EMPTY_LEXEME
|
||||
|
||||
|
||||
@cython.freelist(64)
|
||||
cdef class Token:
|
||||
def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
|
||||
dict lex):
|
||||
self._string_store = string_store
|
||||
def __init__(self, Language lang, int i, int idx,
|
||||
int pos, int lemma, dict lex):
|
||||
self.lang = lang
|
||||
self.idx = idx
|
||||
self.pos = pos
|
||||
self.ner = ner
|
||||
self.i = i
|
||||
self.id = lex['id']
|
||||
|
||||
self.lemma = lemma
|
||||
|
||||
self.cluster = lex['cluster']
|
||||
self.length = lex['length']
|
||||
self.postype = lex['postype']
|
||||
self.sensetype = lex['supersense']
|
||||
self.postype = lex['pos_type']
|
||||
self.sensetype = 0
|
||||
self.sic = lex['sic']
|
||||
self.norm = lex['norm']
|
||||
self.norm = lex['dense']
|
||||
self.shape = lex['shape']
|
||||
self.suffix = lex['asciied']
|
||||
self.suffix = lex['suffix']
|
||||
self.prefix = lex['prefix']
|
||||
|
||||
self.prob = lex['prob']
|
||||
|
@ -145,5 +141,16 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
if self.sic == 0:
|
||||
return ''
|
||||
cdef bytes utf8string = self._string_store[self.sic]
|
||||
cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
|
||||
return utf8string.decode('utf8')
|
||||
|
||||
property lemma:
|
||||
def __get__(self):
|
||||
if self.lemma == 0:
|
||||
return self.string
|
||||
cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
|
||||
return utf8string.decode('utf8')
|
||||
|
||||
property pos:
|
||||
def __get__(self):
|
||||
return self.lang.pos_tagger.tag_names[self.pos]
|
||||
|
|
|
@ -1,8 +1,20 @@
|
|||
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
||||
from libc.stdint cimport uint8_t
|
||||
|
||||
ctypedef uint64_t hash_t
|
||||
ctypedef char* utf8_t
|
||||
ctypedef uint64_t flag_t
|
||||
ctypedef uint32_t attr_t
|
||||
ctypedef uint64_t flags_t
|
||||
ctypedef uint32_t id_t
|
||||
ctypedef uint16_t len_t
|
||||
ctypedef uint16_t tag_t
|
||||
|
||||
|
||||
cdef struct Morphology:
|
||||
uint8_t number
|
||||
uint8_t tenspect # Tense/aspect/voice
|
||||
uint8_t mood
|
||||
uint8_t gender
|
||||
uint8_t person
|
||||
uint8_t case
|
||||
uint8_t misc
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from preshed.maps cimport PreshMap
|
||||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .typedefs cimport utf8_t, id_t, hash_t
|
||||
|
||||
|
@ -11,11 +12,23 @@ cdef struct Utf8Str:
|
|||
int length
|
||||
|
||||
|
||||
cdef struct UniStr:
|
||||
Py_UNICODE* chars
|
||||
size_t n
|
||||
hash_t key
|
||||
|
||||
|
||||
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||
s.chars = &chars[start]
|
||||
s.n = end - start
|
||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
cdef Pool mem
|
||||
cdef PreshMap table
|
||||
cdef PreshMap _map
|
||||
cdef Utf8Str* strings
|
||||
cdef int size
|
||||
cdef int _resize_at
|
||||
|
||||
cdef Utf8Str* intern(self, char* chars, int length) except NULL
|
||||
cdef const Utf8Str* intern(self, char* chars, int length) except NULL
|
||||
|
|
|
@ -5,10 +5,11 @@ import codecs
|
|||
|
||||
SEPARATOR = '\n|-SEP-|\n'
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
def __init__(self):
|
||||
self.mem = Pool()
|
||||
self.table = PreshMap()
|
||||
self._map = PreshMap()
|
||||
self._resize_at = 10000
|
||||
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||
self.size = 1
|
||||
|
@ -17,26 +18,30 @@ cdef class StringStore:
|
|||
def __get__(self):
|
||||
return self.size-1
|
||||
|
||||
def __getitem__(self, string_or_id):
|
||||
def __getitem__(self, object string_or_id):
|
||||
cdef bytes byte_string
|
||||
cdef Utf8Str* utf8str
|
||||
if type(string_or_id) == int or type(string_or_id) == long:
|
||||
cdef const Utf8Str* utf8str
|
||||
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
|
||||
if string_or_id < 1 or string_or_id >= self.size:
|
||||
raise IndexError(string_or_id)
|
||||
utf8str = &self.strings[<int>string_or_id]
|
||||
return utf8str.chars[:utf8str.length]
|
||||
elif type(string_or_id) == bytes:
|
||||
elif isinstance(string_or_id, bytes):
|
||||
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
|
||||
return utf8str.i
|
||||
elif isinstance(string_or_id, unicode):
|
||||
byte_string = string_or_id.encode('utf8')
|
||||
utf8str = self.intern(<char*>byte_string, len(byte_string))
|
||||
return utf8str.i
|
||||
else:
|
||||
raise TypeError(type(string_or_id))
|
||||
|
||||
cdef Utf8Str* intern(self, char* chars, int length) except NULL:
|
||||
cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
|
||||
# 0 means missing, but we don't bother offsetting the index. We waste
|
||||
# slot 0 to simplify the code, because it doesn't matter.
|
||||
assert length != 0
|
||||
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
|
||||
cdef void* value = self.table.get(key)
|
||||
cdef void* value = self._map.get(key)
|
||||
cdef size_t i
|
||||
if value == NULL:
|
||||
if self.size == self._resize_at:
|
||||
|
@ -48,7 +53,7 @@ cdef class StringStore:
|
|||
self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
|
||||
memcpy(self.strings[i].chars, chars, length)
|
||||
self.strings[i].length = length
|
||||
self.table.set(key, <void*>self.size)
|
||||
self._map.set(key, <void*>self.size)
|
||||
self.size += 1
|
||||
else:
|
||||
i = <size_t>value
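
The interning scheme in a plain-Python sketch (editor's addition, not part of the commit): strings map to small integer ids through a hash table, ids index back into a growing array, and slot 0 is deliberately wasted so that 0 can mean "missing":

class ToyStringStore(object):
    def __init__(self):
        self._ids = {}             # string -> id (the PreshMap keyed by hash64)
        self._strings = [None]     # id -> string; slot 0 reserved for "missing"

    def __getitem__(self, string_or_id):
        if isinstance(string_or_id, int):
            return self._strings[string_or_id]
        if string_or_id not in self._ids:
            self._ids[string_or_id] = len(self._strings)
            self._strings.append(string_or_id)
        return self._ids[string_or_id]

store = ToyStringStore()
assert store[u'Hello'] == 1                  # first interned string gets id 1
assert store[store[u'Hello']] == u'Hello'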
|
||||
|
|
|
@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
|
|||
|
||||
def read_lang_data(name):
|
||||
data_dir = path.join(DATA_DIR, name)
|
||||
tokenization = read_tokenization(name)
|
||||
with open(path.join(data_dir, 'specials.json')) as file_:
|
||||
tokenization = ujson.load(file_)
|
||||
prefix = read_prefix(data_dir)
|
||||
suffix = read_suffix(data_dir)
|
||||
infix = read_infix(data_dir)
|
||||
|
@ -26,12 +27,14 @@ def read_prefix(data_dir):
|
|||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||||
return expression
|
||||
|
||||
|
||||
def read_suffix(data_dir):
|
||||
with utf8open(path.join(data_dir, 'suffix')) as file_:
|
||||
with utf8open(path.join(data_dir, 'suffix')) as file_:
|
||||
entries = file_.read().split('\n')
|
||||
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
|
||||
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
|
||||
return expression
|
||||
|
||||
|
||||
def read_infix(data_dir):
|
||||
with utf8open(path.join(data_dir, 'infix')) as file_:
|
||||
entries = file_.read().split('\n')
|
||||
|
|
|
@ -20,15 +20,18 @@ def test_apostrophe():
|
|||
def test_LL():
|
||||
tokens = EN.tokenize("we'll")
|
||||
assert len(tokens) == 2
|
||||
assert tokens[1].string == "will"
|
||||
assert tokens[1].string == "'ll"
|
||||
assert tokens[1].lemma == "will"
|
||||
assert tokens[0].string == "we"
|
||||
|
||||
|
||||
def test_aint():
|
||||
tokens = EN.tokenize("ain't")
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].string == "are"
|
||||
assert tokens[1].string == "not"
|
||||
assert tokens[0].string == "ai"
|
||||
assert tokens[0].lemma == "be"
|
||||
assert tokens[1].string == "n't"
|
||||
assert tokens[1].lemma == "not"
|
||||
|
||||
|
||||
def test_capitalized():
|
||||
|
@ -38,4 +41,12 @@ def test_capitalized():
|
|||
assert len(tokens) == 2
|
||||
tokens = EN.tokenize("Ain't")
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].string == "Are"
|
||||
assert tokens[0].string == "Ai"
|
||||
assert tokens[0].lemma == "be"
|
||||
|
||||
|
||||
def test_punct():
|
||||
tokens = EN.tokenize("We've")
|
||||
assert len(tokens) == 2
|
||||
tokens = EN.tokenize("``We've")
|
||||
assert len(tokens) == 3
|
||||
|
|
|
@ -27,3 +27,9 @@ def test_tweebo_challenge():
|
|||
assert tokens[19].string == '")'
|
||||
assert tokens[20].string == ':>'
|
||||
assert tokens[21].string == '....'
|
||||
|
||||
|
||||
def test_false_positive():
|
||||
text = "example:)"
|
||||
tokens = EN.tokenize(text)
|
||||
assert len(tokens) == 3
|
||||
|
|
|
@ -19,8 +19,12 @@ def test_save_bytes(sstore):
|
|||
|
||||
|
||||
def test_save_unicode(sstore):
|
||||
with pytest.raises(TypeError):
|
||||
A_i = sstore['A']
|
||||
Hello_i = sstore[u'Hello']
|
||||
assert Hello_i == 1
|
||||
assert sstore[u'Hello'] == 1
|
||||
assert sstore[u'goodbye'] != Hello_i
|
||||
assert sstore[u'hello'] != Hello_i
|
||||
assert Hello_i == 1
|
||||
|
||||
|
||||
def test_zero_id(sstore):
|
||||
|
|
tests/test_iter_lexicon.py (new file, 15 lines)
|
@@ -0,0 +1,15 @@
import pytest

from spacy.en import EN

def test_range_iter():
    EN.load()
    for i in range(len(EN.lexicon)):
        lex = EN.lexicon[i]


def test_iter():
    EN.load()
    i = 0
    for lex in EN.lexicon:
        i += 1
|
tests/test_lemmatizer.py (new file, 34 lines)
|
@@ -0,0 +1,34 @@
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import DATA_DIR
from os import path

import pytest


def test_read_index():
    wn = path.join(DATA_DIR, 'wordnet')
    index = read_index(path.join(wn, 'index.noun'))
    assert 'man' in index
    assert 'plantes' not in index
    assert 'plant' in index


def test_read_exc():
    wn = path.join(DATA_DIR, 'wordnet')
    exc = read_exc(path.join(wn, 'verb.exc'))
    assert exc['was'] == ('be',)


@pytest.fixture
def lemmatizer():
    return Lemmatizer(path.join(DATA_DIR, 'wordnet'))


def test_noun_lemmas(lemmatizer):
    do = lemmatizer.noun

    assert do('aardwolves') == set(['aardwolf'])
    assert do('aardwolf') == set(['aardwolf'])
    assert do('planets') == set(['planet'])
    assert do('ring') == set(['ring'])
    assert do('axes') == set(['axis', 'axe', 'ax'])
|
|
@ -7,6 +7,7 @@ from spacy.lexeme import *
|
|||
|
||||
|
||||
def test_is_alpha():
|
||||
EN.load()
|
||||
the = EN.lexicon['the']
|
||||
assert the['flags'] & (1 << IS_ALPHA)
|
||||
year = EN.lexicon['1999']
|
||||
|
@ -16,6 +17,7 @@ def test_is_alpha():
|
|||
|
||||
|
||||
def test_is_digit():
|
||||
EN.load()
|
||||
the = EN.lexicon['the']
|
||||
assert not the['flags'] & (1 << IS_DIGIT)
|
||||
year = EN.lexicon['1999']
|
||||
|
|
|
@ -1,11 +0,0 @@
|
|||
from spacy import util
|
||||
|
||||
|
||||
def test_load_en():
|
||||
rules = util.read_tokenization('en')
|
||||
assert len(rules) != 0
|
||||
aint = [rule for rule in rules if rule[0] == "ain't"][0]
|
||||
chunk, pieces = aint
|
||||
assert chunk == "ain't"
|
||||
assert pieces[0] == "are"
|
||||
assert pieces[1] == "not"
|
|
@ -34,7 +34,7 @@ def test_digits():
|
|||
def test_contraction():
|
||||
tokens = EN.tokenize("don't giggle")
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].sic == EN.lexicon["not"]['sic']
|
||||
assert tokens[1].sic == EN.lexicon["n't"]['sic']
|
||||
tokens = EN.tokenize("i said don't!")
|
||||
assert len(tokens) == 5
|
||||
assert tokens[4].sic == EN.lexicon['!']['sic']
|
||||
|
@ -71,30 +71,39 @@ def test_cnts1():
|
|||
tokens = EN.tokenize(text)
|
||||
assert len(tokens) == 8
|
||||
|
||||
|
||||
def test_cnts2():
|
||||
text = u"""U.N. regulations are not a part of their concern."""
|
||||
tokens = EN.tokenize(text)
|
||||
assert len(tokens) == 10
|
||||
|
||||
|
||||
def test_cnts3():
|
||||
text = u"“Isn't it?”"
|
||||
tokens = EN.tokenize(text)
|
||||
assert len(tokens) == 6
|
||||
words = [t.string for t in tokens]
|
||||
assert len(words) == 6
|
||||
|
||||
|
||||
def test_cnts4():
|
||||
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
|
||||
tokens = EN.tokenize(text)
|
||||
assert len(tokens) == 15
|
||||
words = [t.string for t in tokens]
|
||||
assert len(words) == 15
|
||||
|
||||
|
||||
def test_cnts5():
|
||||
text = """'Me too!', Mr. P. Delaware cried. """
|
||||
tokens = EN.tokenize(text)
|
||||
assert len(tokens) == 11
|
||||
|
||||
|
||||
def test_cnts6():
|
||||
text = u'They ran about 10km.'
|
||||
tokens = EN.tokenize(text)
|
||||
assert len(tokens) == 6
|
||||
words = [t.string for t in tokens]
|
||||
assert len(words) == 6
|
||||
|
||||
|
||||
#def test_cnts7():
|
||||
# text = 'But then the 6,000-year ice age came...'
|
||||
|
|