mirror of https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00

Commit ca54d58638: Merge setup.py
@@ -11,3 +11,8 @@ $
 '
 ``
 `
+#
+US$
+C$
+A$
+a-
@@ -1,13 +1,13 @@
 ,
-"
-)
-]
-}
-*
-!
-?
+\"
+\)
+\]
+\}
+\*
+\!
+\?
 %
-$
+\$
 >
 :
 ;
@@ -16,7 +16,8 @@ $
 ''
 's
 'S
-.
-..
-...
-....
+\.\.
+\.\.\.
+\.\.\.\.
+(?<=[a-z0-9])\.
+(?<=[0-9])km
@@ -4,101 +4,9 @@
 #*---* ---
 #*'s 's
 
-'s 's
-'S 'S
-ain't are not
-aren't are not
-can't can not
-cannot can not
-could've could have
-couldn't could not
-couldn't've could not have
-didn't did not
-doesn't does not
-don't do not
-hadn't had not
-hadn't've had not have
-hasn't has not
-haven't have not
-he'd he would
-he'd've he would have
-he'll he will
-he's he 's
-how'd he would
-how'll he will
-how's how 's
-I'd I would
-I'd've I would have
-I'll I will
-I'm I am
-I'ma I will
-I've I have
-isn't is not
-it'd it would
-it'd've it would have
-it'll it will
-it's it 's
-let's let 's
-mightn't might not
-mightn't've might not have
-might've might have
-mustn't must not
-must've must have
-needn't need not
-not've not have
-shan't shall not
-she'd she would
-she'd've she would have
-she'll she will
-she's she 's
-should've should have
-shouldn't should not
-shouldn't've should not have
-that's that 's
-there'd there would
-there'd've there would have
-there's there is
-they'd there would
-they'd've they would have
-they'll they will
-they're they are
-they've they have
-wasn't was not
-we'd we would
-we'd've we would have
-we'll we will
-we're we are
-we've we have
-weren't were not
-what'll what will
-what're what are
-what's what 's
-what've what have
-when's when 's
-where'd where would
-where's where 's
-where've where have
-who'd who would
-who'll who will
-who're who are
-who's who 's
-who've who have
-why'll who will
-why're why are
-why's why 's
-won't will not
-would've would have
-wouldn't would not
-wouldn't've would not have
-you'd you would
-you'd've you would have
-you'll you will
-you're you are
-you've you have
-'em them
-'ol old
 10km 10 km
 U.S. U.S.
+U.K. U.K.
 non-U.S. non-U.S.
 U.N. U.N.
 Co. Co.
@@ -115,7 +23,12 @@ A.G. A.G.
 Rep. Rep.
 Ms. Ms.
 Mr. Mr.
+Mrs. Mrs.
 a.m. a.m.
+Sen. Sen.
+INC. INC.
+CO. CO.
+COS. COS.
 p.m. p.m.
 Nos. Nos.
 a.k.a. a.k.a.
@@ -127,6 +40,7 @@ E. E.
 F. F.
 G. G.
 H. H.
+I. I.
 J. J.
 K. K.
 L. L.
@@ -205,6 +119,9 @@ Wash. Wash.
 W.Va. W.Va.
 Wis. Wis.
 Wyo. Wyo.
+L.A. L.A.
+R.H. R.H.
+Gov. Gov.
 '' ''
 :) :)
 <3 <3
@@ -262,3 +179,19 @@ V_V V_V
 o.O o.O
 ") ")
 .... ....
+a- a -
+Messrs. Messrs.
+No. No.
+vs. vs.
+Gen. Gen.
+Cos. Cos.
+L.J. L.J.
+D.T. D.T.
+Prof. Prof.
+Bros. Bros.
+J.C. J.C.
+Neb. Neb.
+Adm. Adm.
+U.S.S.R. U.S.S.R.
+Rev. Rev.
+H.F. H.F.
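The entries above pair a raw chunk with the space-separated token strings it should produce. As an illustration only --- the real loader lives in the Cython tokenizer further down this page, and the function name here is made up --- a rules table in this format could be read like this:

    def load_specials(path):
        """Read 'chunk token token ...' lines into a dict of special-case splits."""
        rules = {}
        with open(path, encoding='utf8') as file_:
            for line in file_:
                fields = line.split()
                if not fields:
                    continue
                chunk, tokens = fields[0], fields[1:]
                rules[chunk] = tokens
        return rules

    # e.g. {'U.K.': ['U.K.'], 'a-': ['a', '-'], 'Gov.': ['Gov.']}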
@@ -3,45 +3,228 @@
 You can adapt this file completely to your liking, but it should at least
 contain the root `toctree` directive.
 
+================================
 spaCy NLP Tokenizer and Lexicon
 ================================
 
-spaCy is a library for industrial strength NLP in Python. Its core
-values are:
-
-* **Efficiency**: You won't find faster NLP tools. For shallow analysis, it's 10x
-  faster than Stanford Core NLP, and over 200x faster than NLTK. Its parser is
-  over 100x faster than Stanford's.
-
-* **Accuracy**: All spaCy tools are within 0.5% of the current published
-  state-of-the-art, on both news and web text. NLP moves fast, so always check
-  the numbers --- and don't settle for tools that aren't backed by
-  rigorous recent evaluation.
-
-* **Minimalism**: This isn't a library that covers 43 known algorithms to do X. You
-  get 1 --- the best one --- with a simple, low-level interface. This keeps the
-  code-base small and concrete. Our Python APIs use lists and
-  dictionaries, and our C/Cython APIs use arrays and simple structs.
+spaCy is a library for industrial-strength NLP in Python and Cython. spaCy's
+take on NLP is that it's mostly about feature extraction --- that's the part
+that's specific to NLP, so that's what an NLP library should focus on.
+
+spaCy also believes that for NLP, **efficiency is critical**. If you're
+running batch jobs, you probably have an enormous amount of data; if you're
+serving requests one by one, you want lower latency and fewer servers. Even if
+you're doing exploratory research on relatively small samples, you should still
+value efficiency, because it means you can run more experiments.
+
+Depending on the task, spaCy is between 10 and 200 times faster than NLTK,
+often with much better accuracy. See Benchmarks for details, and
+Why is spaCy so fast? for a discussion of the algorithms and implementation
+that make this possible.
+
++---------+----------+------------+----------+
+| System  | Tokenize | --> Counts | --> Stem |
++---------+----------+------------+----------+
+| spaCy   | 1m42s    | 1m59s      | 1m59s    |
++---------+----------+------------+----------+
+| NLTK    | 20m2s    | 28m24s     | 52m28s   |
++---------+----------+------------+----------+
+
+Times for 100m words of text.
-Comparison
-----------
-
-+----------------+-------------+--------+---------------+--------------+
-| Tokenize & Tag | Speed (w/s) | Memory | % Acc. (news) | % Acc. (web) |
-+----------------+-------------+--------+---------------+--------------+
-| spaCy          | 107,000     | 1.3gb  | 96.7          |              |
-+----------------+-------------+--------+---------------+--------------+
-| Stanford       | 8,000       | 1.5gb  | 96.7          |              |
-+----------------+-------------+--------+---------------+--------------+
-| NLTK           | 543         | 61mb   | 94.0          |              |
-+----------------+-------------+--------+---------------+--------------+
+Unique Lexicon-centric design
+=============================
+
+spaCy helps you build models that generalise better, by making it easy to use
+more robust features. Instead of a list of strings, the tokenizer returns
+references to rich lexical types. Features which ask about the word's Brown
+cluster, its typical part-of-speech tag, or how it's usually cased require no
+extra effort::
+
+    >>> from spacy.en import EN
+    >>> from spacy.feature_names import *
+    >>> feats = (
+        SIC,       # ID of the original word form
+        STEM,      # ID of the stemmed word form
+        CLUSTER,   # ID of the word's Brown cluster
+        IS_TITLE,  # Was the word title-cased?
+        POS_TYPE   # A cluster ID describing what POS tags the word is usually assigned
+    )
+    >>> tokens = EN.tokenize(u'Split words, punctuation, emoticons etc.! ^_^')
+    >>> tokens.to_array(feats)[:5]
+    array([[ 1,  2,  3,  4],
+           [...],
+           [...],
+           [...]])
+
+spaCy is designed to **make the right thing easy**, where the right thing is to:
+
+* **Use rich distributional and orthographic features**. Without these, your
+  model will be very brittle and domain dependent.
+
+* **Compute features per type, not per token**. Because of Zipf's law, you can
+  expect this to be exponentially more efficient; see the sketch after this
+  section.
+
+* **Minimize string processing**, and instead compute with arrays of ID ints.
+
+For the current list of lexical features, see `Lexical Features`_.
+
+.. _lexical features: features.html
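An aside on the second bullet above ("compute features per type, not per token"): the sketch below is editorial, not spaCy code --- the LexType class and featurise helper are invented names --- but it shows why doing the expensive work once per distinct word form pays off under Zipf's law.

    from collections import Counter

    class LexType:
        """Hypothetical lexical type: computed once per distinct word form."""
        def __init__(self, form):
            self.form = form
            self.lower = form.lower()
            self.is_title = form.istitle()
            # Word shape, e.g. "Apple" -> "Xxxxx", "10km" -> "ddxx"
            self.shape = ''.join(
                'X' if c.isupper() else 'x' if c.islower() else
                'd' if c.isdigit() else c
                for c in form)

    def featurise(words, lexicon):
        out = []
        for w in words:
            # Zipf's law: the number of distinct types grows far more slowly than
            # the number of tokens, so LexType.__init__ is amortised over repeats.
            if w not in lexicon:
                lexicon[w] = LexType(w)
            out.append(lexicon[w])
        return out

    lexicon = {}
    tokens = featurise("the cat sat on the mat near the other cat".split(), lexicon)
    print(len(tokens), "tokens but only", len(lexicon), "types")
    print(Counter(t.shape for t in tokens).most_common(1))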
+Tokenization done right
+=======================
+
+Most tokenizers rely on complicated regular expressions. Often, they leave you
+with no way to align the tokens back to the original string --- a vital feature
+if you want to display some mark-up, such as spelling correction. The regular
+expressions also interact, making it hard to accommodate special cases.
+
+spaCy introduces a **novel tokenization algorithm** that's much faster and much
+more flexible:
+
+.. code-block:: python
+
+    def tokenize(string, prefixes={}, suffixes={}, specials={}):
+        '''Sketch of spaCy's tokenization algorithm.'''
+        tokens = []
+        cache = {}
+        for chunk in string.split():
+            # Because of Zipf's law, the cache serves the majority of "chunks".
+            if chunk in cache:
+                tokens.extend(cache[chunk])
+                continue
+            key = chunk
+
+            subtokens = []
+            # Process a chunk by splitting off prefixes e.g. ( " { and suffixes e.g. , . :
+            # If we split one off, check whether we're left with a special-case,
+            # e.g. contractions (can't, won't, etc), emoticons, abbreviations, etc.
+            # This makes the tokenization easy to update and customize.
+            while chunk:
+                prefix, chunk = _consume_prefix(chunk, prefixes)
+                if prefix:
+                    subtokens.append(prefix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+                suffix, chunk = _consume_suffix(chunk, suffixes)
+                if suffix:
+                    subtokens.append(suffix)
+                    if chunk in specials:
+                        subtokens.extend(specials[chunk])
+                        break
+            cache[key] = subtokens
+            tokens.extend(subtokens)
+        return tokens
|
Your data is going to have its own quirks, so it's really useful to have
|
||||||
|
a tokenizer you can easily control. To see the limitations of the standard
|
||||||
|
regex-based approach, check out `CMU's recent work on tokenizing tweets <http://www.ark.cs.cmu.edu/TweetNLP/>`_. Despite a lot of careful attention, they can't handle all of their
|
||||||
|
known emoticons correctly --- doing so would interfere with the way they
|
||||||
|
process other punctuation. This isn't a problem for spaCy: we just add them
|
||||||
|
all to the special tokenization rules.
|
||||||
|
|
||||||
|
spaCy's tokenizer is also incredibly efficient:
|
||||||
|
|
||||||
|
spaCy can create an inverted index of the 1.8 billion word Gigaword corpus,
|
||||||
|
in under half an hour --- on a Macbook Air. See the `inverted
|
||||||
|
index tutorial`_.
|
||||||
|
|
||||||
|
.. _inverted index tutorial: index_tutorial.html
|
||||||
|
|
||||||
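The _consume_prefix and _consume_suffix helpers are left undefined in the sketch above. A minimal stand-in --- single-character affix tables and toy specials entries, not the real data files or regular expressions shown earlier on this page --- might look like:

    def _consume_prefix(chunk, prefixes):
        # Split one leading character off if it is a known prefix, e.g. ( " {
        if chunk and chunk[0] in prefixes:
            return chunk[0], chunk[1:]
        return None, chunk

    def _consume_suffix(chunk, suffixes):
        # Split one trailing character off if it is a known suffix, e.g. , . : !
        if chunk and chunk[-1] in suffixes:
            return chunk[-1], chunk[:-1]
        return None, chunk

    PREFIXES = {'(', '"', '{'}
    SUFFIXES = {',', '.', ':', ')', '"', '!'}
    # Emoticons and contractions are just more entries in the specials table:
    SPECIALS = {"can't": ['can', 'not'], ':)': [':)'], '<3': ['<3']}

    print(_consume_prefix('"hello', PREFIXES))   # ('"', 'hello')
    print(_consume_suffix('world!', SUFFIXES))   # ('!', 'world')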
|
Comparison with NLTK
|
||||||
|
====================
|
||||||
|
|
||||||
|
`NLTK <http://nltk.org>`_ provides interfaces to a wide-variety of NLP
|
||||||
|
tools and resources, and its own implementations of a few algorithms. It comes
|
||||||
|
with comprehensive documentation, and a book introducing concepts in NLP. For
|
||||||
|
these reasons, it's very widely known. However, if you're trying to make money
|
||||||
|
or do cutting-edge research, NLTK is not a good choice.
|
||||||
|
|
||||||
|
The `list of stuff in NLTK <http://www.nltk.org/py-modindex.html>`_ looks impressive,
|
||||||
|
but almost none of it is useful for real work. You're not going to make any money,
|
||||||
|
or do top research, by using the NLTK chat bots, theorem provers, toy CCG implementation,
|
||||||
|
etc. Most of NLTK is there to assist in the explanation ideas in computational
|
||||||
|
linguistics, at roughly an undergraduate level.
|
||||||
|
But it also claims to support serious work, by wrapping external tools.
|
||||||
|
|
||||||
|
In a pretty well known essay, Joel Spolsky discusses the pain of dealing with
|
||||||
|
`leaky abstractions <http://www.joelonsoftware.com/articles/LeakyAbstractions.html>`_.
|
||||||
|
An abstraction tells you to not care about implementation
|
||||||
|
details, but sometimes the implementation matters after all. When it
|
||||||
|
does, you have to waste time revising your assumptions.
|
||||||
|
|
||||||
|
NLTK's wrappers call external tools via subprocesses, and wrap this up so
|
||||||
|
that it looks like a native API. This abstraction leaks *a lot*. The system
|
||||||
|
calls impose far more overhead than a normal Python function call, which makes
|
||||||
|
the most natural way to program against the API infeasible.
|
||||||
|
|
||||||
|
|
||||||
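To make that overhead concrete, here is a small editorial benchmark sketch --- it spawns a trivial child process rather than the real Stanford tagger, and the numbers will vary by machine --- comparing an in-process call with a per-document subprocess call:

    import subprocess
    import sys
    import time

    def tag_in_process(text):
        # Stand-in for a native, in-process tagger.
        return [(w, 'NN') for w in text.split()]

    def tag_via_subprocess(text):
        # Stand-in for a wrapper that launches an external tool per document:
        # even a do-nothing child process costs milliseconds before any tagging
        # work can happen.
        proc = subprocess.run(
            [sys.executable, '-c', 'import sys; sys.stdout.write(sys.stdin.read())'],
            input=text, capture_output=True, text=True)
        return [(w, 'NN') for w in proc.stdout.split()]

    doc = "This is a short document ."
    for fn in (tag_in_process, tag_via_subprocess):
        start = time.perf_counter()
        for _ in range(20):
            fn(doc)
        per_call = (time.perf_counter() - start) / 20
        print(f"{fn.__name__}: {per_call * 1000:.2f} ms per call")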
|
Case study: POS tagging
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
Here's a quick comparison of the following POS taggers:
|
||||||
|
|
||||||
|
* **Stanford (CLI)**: The Stanford POS tagger, invoked once as a batch process
|
||||||
|
from the command-line;
|
||||||
|
* **nltk.tag.stanford**: The Stanford tagger, invoked document-by-document via
|
||||||
|
NLTK's wrapper;
|
||||||
|
* **nltk.pos_tag**: NLTK's own POS tagger, invoked document-by-document.
|
||||||
|
* **spacy.en.pos_tag**: spaCy's POS tagger, invoked document-by-document.
|
||||||
|
|
||||||
|
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| System | Speed (w/s) | % Acc. |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| spaCy | 107,000 | 96.7 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| Stanford (CLI) | 8,000 | 96.7 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| nltk.pos_tag | 543 | 94.0 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
| nltk.tag.stanford | 209 | 96.7 |
|
||||||
|
+-------------------+-------------+--------+
|
||||||
|
|
||||||
|
Experimental details TODO. Three things are apparent from this comparison:
|
||||||
|
|
||||||
|
1. The native NLTK tagger, nltk.pos_tag, is both slow and inaccurate;
|
||||||
|
|
||||||
|
2. Calling the Stanford tagger document-by-document via NLTK is **40x** slower
|
||||||
|
than invoking the model once as a batch process, via the command-line;
|
||||||
|
|
||||||
|
3. spaCy is over 10x faster than the Stanford tagger, even when called
|
||||||
|
**sentence-by-sentence**.
|
||||||
|
|
||||||
|
The problem is that NLTK simply wraps the command-line
|
||||||
|
interfaces of these tools, so communication is via a subprocess. NLTK does not
|
||||||
|
even hold open a pipe for you --- the model is reloaded, again and again.
|
||||||
|
|
||||||
|
To use the wrapper effectively, you should batch up your text as much as possible.
|
||||||
|
This probably isn't how you would like to structure your pipeline, and you
|
||||||
|
might not be able to batch up much text at all, e.g. if serving a single
|
||||||
|
request means processing a single document.
|
||||||
|
Technically, NLTK does give you Python functions to access lots of different
|
||||||
|
systems --- but, you can't use them as you would expect to use a normal Python
|
||||||
|
function. The abstraction leaks.
|
||||||
|
|
||||||
|
Here's the bottom-line: the Stanford tools are written in Java, so using them
|
||||||
|
from Python sucks. You shouldn't settle for this. It's a problem that springs
|
||||||
|
purely from the tooling, rather than the domain.
|
||||||
|
|
+Summary
+-------
+
+NLTK is a well-known Python library for NLP, but for the important bits, you
+don't get actual Python modules. You get wrappers which throw to external
+tools, via subprocesses. This is not at all the same thing.
+
+spaCy is implemented in Cython, just like numpy, scikit-learn, lxml and other
+high-performance Python libraries. So you get a native Python API, but the
+performance you expect from a program written in C.
+
 .. toctree::
     :hidden:
     :maxdepth: 3
 
-    what/index.rst
-    why/index.rst
-    how/index.rst
+    features.rst
+    license_stories.rst
26  setup.py

@@ -10,6 +10,8 @@ import os.path
 from os import path
 from glob import glob
 
+import numpy
+
 
 def clean(ext):
     for pyx in ext.sources:
@@ -34,7 +36,7 @@ compile_args = []
 link_args = []
 libs = []
 
-includes = ['.']
+includes = ['.', numpy.get_include()]
 cython_includes = ['.']
@@ -50,18 +52,20 @@ exts = [
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.morphology", ["spacy/morphology.pyx"], language="c++",
+              include_dirs=includes),
+    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
+    #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
 ]
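For readers outside this code base: the include-path change above is the standard pattern for building Cython extensions that use numpy's C headers. A minimal, self-contained example of the same pattern (the package and module names are placeholders, and Cython and numpy must be installed):

    # Minimal illustration of the include-path change above: any Cython extension
    # that cimports numpy needs numpy.get_include() on its include path.
    import numpy
    from setuptools import setup, Extension
    from Cython.Build import cythonize

    ext = Extension(
        "mypkg.fast_ops",                    # placeholder module name
        ["mypkg/fast_ops.pyx"],
        language="c++",
        include_dirs=[".", numpy.get_include()],
    )

    setup(name="mypkg", ext_modules=cythonize([ext]))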
@@ -1,66 +0,0 @@ (deleted file)

The first deleted file declared the old context feature-extraction API (apparently the spacy.context module whose Extension entry was removed from setup.py above): a `cdef class Token` with readonly `atom_t` fields for sic, cluster, norm, shape, asciied, prefix, suffix and length; postype, nertype and sensetype; the is_alpha/is_ascii/is_digit/is_lower/is_punct/is_space/is_title/is_upper orthographic flags; like_url and like_number; oft_lower, oft_title and oft_upper; the in_males/in_females/in_surnames/in_places/in_games/in_celebs/in_names gazetteer flags; and pos, sense and ner. It also declared a `cdef class Slots` holding Tokens P4..P1, N0, N1..N4, plus:

-cdef int N_FIELDS
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
-cpdef Slots FIELD_IDS

@@ -1,126 +0,0 @@ (deleted file)

The matching implementation file is deleted as well. It numbered each Token field into a flat feature array (`_number_token`), copied lexeme attributes and flag bits for one token into that array (`_fill_token`, using expressions such as `c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)`), defined `N_FIELDS` and the module-level `FIELD_IDS` slots, and filled a nine-token window around position i:

-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
-    _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
-    ...
-    _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
-    return 1
132  spacy/en.pxd

@@ -1,5 +1,133 @@
-from spacy.lang cimport Language
-from spacy.tokens cimport Tokens
+from thinc.typedefs cimport atom_t
+
+from .lang cimport Language
+from .tokens cimport Tokens
+from .tokens cimport TokenC

The header then gains the English-specific enums and context-field IDs: `en_person_t` (NO_PERSON, FIRST, SECOND, THIRD, NON_THIRD), `en_number_t` (NO_NUMBER, SINGULAR, PLURAL, MASS), `en_gender_t` (NO_GENDER, MASCULINE, FEMININE, NEUTER), `en_case_t` (NO_CASE, NOMINATIVE, GENITIVE, ACCUSATIVE, REFLEXIVE, DEMONYM), `en_tenspect_t` (NO_TENSE, BASE_VERB, PRESENT, PAST, PASSIVE, ING, MODAL), `misc_t` (NO_MISC, COMPARATIVE, SUPERLATIVE, RELATIVE, NAME), the `FlagID` bit-flag enum (IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUMBER, OFT_LOWER, OFT_TITLE, OFT_UPPER, IN_MALES, IN_FEMALES, IN_SURNAMES, IN_PLACES, IN_GAMES, IN_CELEBS, IN_NAMES), and an anonymous enum of POS-context slots (P2_sic .. P2_pos_type, P1_sic .. P1_pos_type, W_sic .. W_pos_type, N1_sic .. N1_pos_type, N2_sic .. N2_pos_type, N_CONTEXT_FIELDS). The `cdef class English(Language)` declaration itself is unchanged.
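The FlagID enum above assigns each boolean lexical property a bit position, and a lexeme's properties are packed into a single integer. A plain-Python sketch of the same pattern --- Python's built-in string predicates stand in for the functions in spaCy's orth module, and the LIKE_NUMBER test here is deliberately crude:

    from enum import IntEnum

    class FlagID(IntEnum):
        IS_ALPHA = 0
        IS_DIGIT = 1
        IS_LOWER = 2
        IS_TITLE = 3
        LIKE_NUMBER = 4

    def set_flags(string: str) -> int:
        # Each predicate contributes one bit; the whole word costs one integer.
        flags = 0
        flags |= string.isalpha() << FlagID.IS_ALPHA
        flags |= string.isdigit() << FlagID.IS_DIGIT
        flags |= string.islower() << FlagID.IS_LOWER
        flags |= string.istitle() << FlagID.IS_TITLE
        flags |= string.replace('.', '', 1).isdigit() << FlagID.LIKE_NUMBER
        return flags

    def check_flag(flags: int, flag: FlagID) -> bool:
        return bool(flags & (1 << flag))

    print(check_flag(set_flags("Apple"), FlagID.IS_TITLE))    # True
    print(check_flag(set_flags("3.14"), FlagID.LIKE_NUMBER))  # True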
160  spacy/en.pyx

@@ -30,14 +30,101 @@ same scheme. Tokenization problems are a major cause of poor performance for
The trailing TODO comment about translate_treebank_tokenization is dropped from the module docstring, and new imports are added:

+from .typedefs cimport flags_t
+import orth
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL
+from .tokens cimport Morphology

followed by a `POS_TAGS` dict mapping each Penn Treebank tag to a coarse class plus morphological features, for example:

+POS_TAGS = {
+    'NULL': (NO_TAG, {}),
+    'CC': (CONJ, {}),
+    'JJR': (ADJ, {'misc': COMPARATIVE}),
+    'MD': (VERB, {'tenspect': MODAL}),
+    'NNS': (NOUN, {'number': PLURAL}),
+    'POS': (PRT, {'case': GENITIVE}),
+    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
+    ...
+}

and a `POS_TEMPLATES` tuple of context-feature templates over the P2..N2 window (word form, prefix, suffix, Brown cluster, shape, pos_type and neighbouring tags).

@@ -47,7 +134,68 @@ cdef class English(Language):
The previously empty English class gains its implementation:

+    def get_props(self, unicode string):
+        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
+
+    def set_flags(self, unicode string):
+        cdef flags_t flags = 0
+        flags |= orth.is_alpha(string) << IS_ALPHA
+        ...
+        flags |= orth.like_number(string) << LIKE_NUMBER
+        return flags

plus `set_pos` (consult the tagger's tag dictionary first, otherwise fill a POS context window, predict, and call `self.morphologizer.set_morph(i, t)`), `train_pos` (the same loop, counting correct predictions against gold tags), and the module-level helpers `fill_pos_context` / `_fill_from_token`, which copy sic, cluster, shape, prefix, suffix, pos, lemma and pos_type for the P2, P1, W, N1 and N2 slots into a flat `atom_t` array. `EN = English('en')` is unchanged.
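A plain-Python sketch of the tagging loop described above --- tag dictionary first, context-window features otherwise. The tagger is a stub, and the field layout and helper names are illustrative, not spaCy's:

    FIELDS_PER_TOKEN = 3  # form, shape, index

    def fill_context(context, i, tokens):
        # Copy features for the P1, W, N1 slots into one flat list.
        for slot, j in enumerate((i - 1, i, i + 1)):
            base = slot * FIELDS_PER_TOKEN
            if 0 <= j < len(tokens):
                form = tokens[j]
                context[base + 0] = form
                context[base + 1] = ''.join('X' if c.isupper() else 'x' for c in form)
                context[base + 2] = j
            else:
                context[base:base + FIELDS_PER_TOKEN] = ['', '', -1]

    def tag(tokens, tagdict, predict):
        tags = []
        context = [''] * (3 * FIELDS_PER_TOKEN)
        for i, word in enumerate(tokens):
            if word in tagdict:            # unambiguous frequent word: skip the model
                tags.append(tagdict[word])
            else:
                fill_context(context, i, tokens)
                tags.append(predict(context))
        return tags

    print(tag("the cat sat".split(), {"the": "DT"}, predict=lambda ctx: "NN"))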
@@ -1,38 +1,38 @@
The matching declarations file changes in step with the tokenizer rewrite: the Py_UNICODE character predicates are now cimported from cpython instead of being declared by hand against "Python.h", `String` is replaced by `UniStr` (from .utf8string, alongside StringStore), the NERParser import gives way to the new Morphologizer, and the tokenizer cache stores a tagged union instead of a bare lexeme array:

+cdef union LexemesOrTokens:
+    const Lexeme* const* lexemes
+    TokenC* tokens
+
+cdef struct Cached:
+    LexemesOrTokens data
+    bint is_lex
+    int length

Lexicon gains a `cpdef public get_lex_props` hook and a memory-pool-aware lookup, and drops its `size` attribute:

-    cdef Lexeme* get(self, String* s) except NULL
-    cdef PreshMap _dict
+    cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
+    cdef PreshMap _map

@@ -41,9 +41,8 @@ cdef class Language:
-    cpdef readonly NERParser ner_tagger
+    cpdef readonly Morphologizer morphologizer

@@ -52,13 +51,14 @@ cdef class Language:
-    cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
-    cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
+    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
-    cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
311
spacy/lang.pyx
311
spacy/lang.pyx
|
@ -18,13 +18,14 @@ from preshed.maps cimport PreshMap
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .lexeme cimport EMPTY_LEXEME
|
from .lexeme cimport EMPTY_LEXEME
|
||||||
from .lexeme cimport init as lexeme_init
|
from .lexeme cimport init as lexeme_init
|
||||||
|
from .lexeme cimport check_flag
|
||||||
|
|
||||||
|
from .utf8string cimport slice_unicode
|
||||||
|
|
||||||
from . import util
|
from . import util
|
||||||
from .util import read_lang_data
|
from .util import read_lang_data
|
||||||
from .tokens import Tokens
|
from .tokens import Tokens
|
||||||
|
from .tokens cimport Morphology
|
||||||
from .tagger cimport Tagger
|
|
||||||
from .ner.greedy_parser cimport NERParser
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
|
@ -37,29 +38,30 @@ cdef class Language:
|
||||||
self._prefix_re = re.compile(prefix)
|
self._prefix_re = re.compile(prefix)
|
||||||
self._suffix_re = re.compile(suffix)
|
self._suffix_re = re.compile(suffix)
|
||||||
self._infix_re = re.compile(infix)
|
self._infix_re = re.compile(infix)
|
||||||
self.lexicon = Lexicon()
|
self.lexicon = Lexicon(self.get_props)
|
||||||
if path.exists(path.join(util.DATA_DIR, name, 'lexemes')):
|
|
||||||
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
|
|
||||||
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
|
|
||||||
self._load_special_tokenization(rules)
|
self._load_special_tokenization(rules)
|
||||||
if path.exists(path.join(util.DATA_DIR, name, 'pos')):
|
|
||||||
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
|
|
||||||
else:
|
|
||||||
self.pos_tagger = None
|
self.pos_tagger = None
|
||||||
if path.exists(path.join(util.DATA_DIR, name, 'ner')):
|
self.morphologizer = None
|
||||||
self.ner_tagger = NERParser(path.join(util.DATA_DIR, name, 'ner'))
|
|
||||||
|
def load(self):
|
||||||
|
self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
|
||||||
|
self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
|
||||||
|
if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
|
||||||
|
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
|
||||||
|
self.morphologizer = Morphologizer(self.lexicon.strings,
|
||||||
|
path.join(util.DATA_DIR, self.name))
|
||||||
|
|
||||||
cpdef Tokens tokens_from_list(self, list strings):
|
cpdef Tokens tokens_from_list(self, list strings):
|
||||||
cdef int length = sum([len(s) for s in strings])
|
cdef int length = sum([len(s) for s in strings])
|
||||||
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
|
cdef Tokens tokens = Tokens(self, length)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
return tokens
|
return tokens
|
||||||
cdef String string_struct
|
cdef UniStr string_struct
|
||||||
cdef unicode py_string
|
cdef unicode py_string
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
for i, py_string in enumerate(strings):
|
for i, py_string in enumerate(strings):
|
||||||
string_from_unicode(&string_struct, py_string)
|
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||||
tokens.push_back(idx, self.lexicon.get(&string_struct))
|
tokens.push_back(idx, <const Lexeme*>self.lexicon.get(tokens.mem, &string_struct))
|
||||||
idx += len(py_string) + 1
|
idx += len(py_string) + 1
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
@ -79,22 +81,21 @@ cdef class Language:
|
||||||
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
|
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
|
||||||
"""
|
"""
|
||||||
cdef int length = len(string)
|
cdef int length = len(string)
|
||||||
cdef Tokens tokens = Tokens(self.lexicon.strings, length)
|
cdef Tokens tokens = Tokens(self, length)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
return tokens
|
return tokens
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
cdef int start = 0
|
cdef int start = 0
|
||||||
|
cdef bint cache_hit
|
||||||
cdef Py_UNICODE* chars = string
|
cdef Py_UNICODE* chars = string
|
||||||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||||
cdef String span
|
cdef UniStr span
|
||||||
for i in range(1, length):
|
for i in range(1, length):
|
||||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||||
if start < i:
|
if start < i:
|
||||||
string_slice(&span, chars, start, i)
|
slice_unicode(&span, chars, start, i)
|
||||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
cache_hit = self._try_cache(start, span.key, tokens)
|
||||||
if lexemes != NULL:
|
if not cache_hit:
|
||||||
tokens.extend(start, lexemes, 0)
|
|
||||||
else:
|
|
||||||
self._tokenize(tokens, &span, start, i)
|
self._tokenize(tokens, &span, start, i)
|
||||||
in_ws = not in_ws
|
in_ws = not in_ws
|
||||||
start = i
|
start = i
|
||||||
|
@ -102,15 +103,27 @@ cdef class Language:
|
||||||
start += 1
|
start += 1
|
||||||
i += 1
|
i += 1
|
||||||
if start < i:
|
if start < i:
|
||||||
string_slice(&span, chars, start, i)
|
slice_unicode(&span, chars, start, i)
|
||||||
lexemes = <Lexeme**>self._cache.get(span.key)
|
cache_hit = self._try_cache(start, span.key, tokens)
|
||||||
if lexemes != NULL:
|
if not cache_hit:
|
||||||
tokens.extend(start, lexemes, 0)
|
|
||||||
else:
|
|
||||||
self._tokenize(tokens, &span, start, i)
|
self._tokenize(tokens, &span, start, i)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
|
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
|
||||||
|
#cached = <Cached*>self._specials.get(key)
|
||||||
|
cached = <Cached*>self._cache.get(key)
|
||||||
|
if cached == NULL:
|
||||||
|
return False
|
||||||
|
cdef int i
|
||||||
|
if cached.is_lex:
|
||||||
|
for i in range(cached.length):
|
||||||
|
idx = tokens.push_back(idx, cached.data.lexemes[i])
|
||||||
|
else:
|
||||||
|
for i in range(cached.length):
|
||||||
|
idx = tokens.push_back(idx, &cached.data.tokens[i])
|
||||||
|
return True
|
||||||
|
|
||||||
|
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
|
||||||
cdef vector[Lexeme*] prefixes
|
cdef vector[Lexeme*] prefixes
|
||||||
cdef vector[Lexeme*] suffixes
|
cdef vector[Lexeme*] suffixes
|
||||||
cdef hash_t orig_key
|
cdef hash_t orig_key
|
||||||
|
@ -119,88 +132,95 @@ cdef class Language:
|
||||||
orig_size = tokens.length
|
orig_size = tokens.length
|
||||||
self._split_affixes(span, &prefixes, &suffixes)
|
self._split_affixes(span, &prefixes, &suffixes)
|
||||||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||||
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
|
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
||||||
|
|
||||||
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
|
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
||||||
vector[Lexeme*] *suffixes) except NULL:
|
vector[const Lexeme*] *suffixes) except NULL:
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
cdef String prefix
|
cdef UniStr prefix
|
||||||
cdef String suffix
|
cdef UniStr suffix
|
||||||
cdef String minus_pre
|
cdef UniStr minus_pre
|
||||||
cdef String minus_suf
|
cdef UniStr minus_suf
|
||||||
cdef size_t last_size = 0
|
cdef size_t last_size = 0
|
||||||
while string.n != 0 and string.n != last_size:
|
while string.n != 0 and string.n != last_size:
|
||||||
last_size = string.n
|
last_size = string.n
|
||||||
pre_len = self._find_prefix(string.chars, string.n)
|
pre_len = self._find_prefix(string.chars, string.n)
|
||||||
if pre_len != 0:
|
if pre_len != 0:
|
||||||
string_slice(&prefix, string.chars, 0, pre_len)
|
slice_unicode(&prefix, string.chars, 0, pre_len)
|
||||||
string_slice(&minus_pre, string.chars, pre_len, string.n)
|
slice_unicode(&minus_pre, string.chars, pre_len, string.n)
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
|
||||||
string[0] = minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
break
|
break
|
||||||
suf_len = self._find_suffix(string.chars, string.n)
|
suf_len = self._find_suffix(string.chars, string.n)
|
||||||
if suf_len != 0:
|
if suf_len != 0:
|
||||||
string_slice(&suffix, string.chars, string.n - suf_len, string.n)
|
slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
|
||||||
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
|
slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||||
# Check whether we've hit a special-case
|
# Check whether we've hit a special-case
|
||||||
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
|
||||||
string[0] = minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
break
|
break
|
||||||
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
||||||
string_slice(string, string.chars, pre_len, string.n - suf_len)
|
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
elif pre_len:
|
elif pre_len:
|
||||||
string[0] = minus_pre
|
string[0] = minus_pre
|
||||||
prefixes.push_back(self.lexicon.get(&prefix))
|
prefixes.push_back(self.lexicon.get(self.lexicon.mem, &prefix))
|
||||||
elif suf_len:
|
elif suf_len:
|
||||||
string[0] = minus_suf
|
string[0] = minus_suf
|
||||||
suffixes.push_back(self.lexicon.get(&suffix))
|
suffixes.push_back(self.lexicon.get(self.lexicon.mem, &suffix))
|
||||||
if self._specials.get(string.key):
|
if self._specials.get(string.key):
|
||||||
break
|
break
|
||||||
return string
|
return string
|
||||||
|
|
||||||
cdef int _attach_tokens(self, Tokens tokens,
|
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||||
int idx, String* string,
|
vector[const Lexeme*] *prefixes,
|
||||||
vector[Lexeme*] *prefixes,
|
vector[const Lexeme*] *suffixes) except -1:
|
||||||
vector[Lexeme*] *suffixes) except -1:
|
cdef bint cache_hit
|
||||||
cdef int split
|
cdef int split
|
||||||
cdef Lexeme** lexemes
|
cdef const Lexeme* const* lexemes
|
||||||
cdef Lexeme* lexeme
|
cdef Lexeme* lexeme
|
||||||
cdef String span
|
cdef UniStr span
|
||||||
|
cdef int i
|
||||||
if prefixes.size():
|
if prefixes.size():
|
||||||
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
|
for i in range(prefixes.size()):
|
||||||
|
idx = tokens.push_back(idx, prefixes[0][i])
|
||||||
if string.n != 0:
|
if string.n != 0:
|
||||||
|
cache_hit = self._try_cache(idx, string.key, tokens)
|
||||||
lexemes = <Lexeme**>self._cache.get(string.key)
|
if cache_hit:
|
||||||
if lexemes != NULL:
|
idx = tokens.data[tokens.length - 1].idx + 1
|
||||||
idx = tokens.extend(idx, lexemes, 0)
|
|
||||||
else:
|
else:
|
||||||
split = self._find_infix(string.chars, string.n)
|
split = self._find_infix(string.chars, string.n)
|
||||||
if split == 0 or split == -1:
|
if split == 0 or split == -1:
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(string))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, string))
|
||||||
else:
|
else:
|
||||||
string_slice(&span, string.chars, 0, split)
|
slice_unicode(&span, string.chars, 0, split)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
string_slice(&span, string.chars, split, split+1)
|
slice_unicode(&span, string.chars, split, split+1)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
string_slice(&span, string.chars, split + 1, string.n)
|
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||||
idx = tokens.push_back(idx, self.lexicon.get(&span))
|
idx = tokens.push_back(idx, self.lexicon.get(tokens.mem, &span))
|
||||||
cdef vector[Lexeme*].reverse_iterator it = suffixes.rbegin()
|
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
|
||||||
while it != suffixes.rend():
|
while it != suffixes.rend():
|
||||||
idx = tokens.push_back(idx, deref(it))
|
idx = tokens.push_back(idx, deref(it))
|
||||||
preinc(it)
|
preinc(it)
|
||||||
|
|
||||||
cdef int _save_cached(self, Lexeme** tokens, hash_t key, int n) except -1:
|
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
|
||||||
lexemes = <Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
lexemes[i] = tokens[i]
|
if tokens[i].lex.id == 1:
|
||||||
lexemes[i + 1] = NULL
|
return 0
|
||||||
self._cache.set(key, lexemes)
|
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||||
|
cached.length = n
|
||||||
|
cached.is_lex = True
|
||||||
|
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
|
||||||
|
for i in range(n):
|
||||||
|
lexemes[i] = tokens[i].lex
|
||||||
|
cached.data.lexemes = <const Lexeme* const*>lexemes
|
||||||
|
self._cache.set(key, cached)
|
||||||
|
|
||||||
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||||
cdef unicode string = chars[:length]
|
cdef unicode string = chars[:length]
|
||||||
|
@ -217,66 +237,120 @@ cdef class Language:
|
||||||
match = self._suffix_re.search(string)
|
match = self._suffix_re.search(string)
|
||||||
return (match.end() - match.start()) if match is not None else 0
|
return (match.end() - match.start()) if match is not None else 0
|
||||||
|
|
||||||
def _load_special_tokenization(self, token_rules):
|
def _load_special_tokenization(self, object rules):
|
||||||
'''Load special-case tokenization rules.
|
'''Add a special-case tokenization rule.
|
||||||
|
|
||||||
Loads special-case tokenization rules into the Language._cache cache,
|
|
||||||
read from data/<lang>/tokenization . The special cases are loaded before
|
|
||||||
any language data is tokenized, giving these priority. For instance,
|
|
||||||
the English tokenization rules map "ain't" to ["are", "not"].
|
|
||||||
|
|
||||||
Args:
|
|
||||||
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
|
|
||||||
a string and tokens is a list of strings.
|
|
||||||
'''
|
'''
|
||||||
|
cdef int i
|
||||||
|
cdef unicode chunk
|
||||||
|
cdef list substrings
|
||||||
|
cdef unicode form
|
||||||
|
cdef unicode lemma
|
||||||
|
cdef dict props
|
||||||
cdef Lexeme** lexemes
|
cdef Lexeme** lexemes
|
||||||
cdef hash_t hashed
|
cdef hash_t hashed
|
||||||
cdef String string
|
cdef UniStr string
|
||||||
for uni_string, substrings in token_rules:
|
for chunk, substrings in sorted(rules.items()):
|
||||||
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
|
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||||
for i, substring in enumerate(substrings):
|
for i, props in enumerate(substrings):
|
||||||
string_from_unicode(&string, substring)
|
form = props['F']
|
||||||
lexemes[i] = <Lexeme*>self.lexicon.get(&string)
|
lemma = props.get("L", None)
|
||||||
lexemes[i + 1] = NULL
|
slice_unicode(&string, form, 0, len(form))
|
||||||
string_from_unicode(&string, uni_string)
|
tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
|
||||||
self._specials.set(string.key, lexemes)
|
if lemma:
|
||||||
self._cache.set(string.key, lexemes)
|
tokens[i].lemma = self.lexicon.strings[lemma]
|
||||||
|
set_morph_from_dict(&tokens[i].morph, props)
|
||||||
|
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
|
||||||
|
cached.length = len(substrings)
|
||||||
|
cached.is_lex = False
|
||||||
|
cached.data.tokens = tokens
|
||||||
|
slice_unicode(&string, chunk, 0, len(chunk))
|
||||||
|
self._specials.set(string.key, cached)
|
||||||
|
self._cache.set(string.key, cached)
|
||||||
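For reference, a minimal sketch of the rule format this method consumes: each special-case chunk maps to a list of per-token property dicts, where 'F' is the surface form, 'L' an optional lemma, and any remaining keys are read by set_morph_from_dict. The concrete entry below is illustrative only (it mirrors the "ain't" behaviour exercised by the tests later in this diff), not a dump of the data files.

    rules = {
        u"ain't": [
            {'F': u"ai", 'L': u"be"},    # first token keeps the surface "ai", lemma "be"
            {'F': u"n't", 'L': u"not"},  # second token "n't", lemma "not"
        ],
    }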
|
|
||||||
|
|
||||||
|
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||||
|
morph.number = props.get('number', 0)
|
||||||
|
morph.tenspect = props.get('tenspect', 0)
|
||||||
|
morph.mood = props.get('mood', 0)
|
||||||
|
morph.gender = props.get('gender', 0)
|
||||||
|
morph.person = props.get('person', 0)
|
||||||
|
morph.case = props.get('case', 0)
|
||||||
|
morph.misc = props.get('misc', 0)
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexicon:
|
cdef class Lexicon:
|
||||||
def __init__(self):
|
'''A map container for a language's Lexeme structs.
|
||||||
|
|
||||||
|
Also interns UTF-8 strings, and maps them to consecutive integer IDs.
|
||||||
|
'''
|
||||||
|
def __init__(self, object get_props):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._dict = PreshMap(2 ** 20)
|
self._map = PreshMap(2 ** 20)
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.size = 1
|
self.get_lex_props = get_props
|
||||||
|
|
||||||
cdef Lexeme* get(self, String* string) except NULL:
|
def __len__(self):
|
||||||
|
return self.lexemes.size()
|
||||||
|
|
||||||
|
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
||||||
|
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
|
||||||
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
|
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||||
cdef Lexeme* lex
|
cdef Lexeme* lex
|
||||||
lex = <Lexeme*>self._dict.get(string.key)
|
lex = <Lexeme*>self._map.get(string.key)
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
return lex
|
return lex
|
||||||
lex = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
if string.n < 3:
|
||||||
lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key, self.strings, {})
|
mem = self.mem
|
||||||
self._dict.set(string.key, lex)
|
cdef unicode py_string = string.chars[:string.n]
|
||||||
|
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
|
||||||
|
lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
|
||||||
|
self.get_lex_props(py_string))
|
||||||
|
if mem is self.mem:
|
||||||
|
self._map.set(string.key, lex)
|
||||||
while self.lexemes.size() < (lex.id + 1):
|
while self.lexemes.size() < (lex.id + 1):
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.lexemes[lex.id] = lex
|
self.lexemes[lex.id] = lex
|
||||||
self.size += 1
|
else:
|
||||||
|
lex[0].id = 1
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
|
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||||
|
unseen unicode string is given, a new Lexeme is created and stored.
|
||||||
|
|
||||||
|
This function relies on Cython's struct-to-dict conversion. Python clients
|
||||||
|
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
|
||||||
|
with int values. Cython clients can instead receive a Lexeme struct value.
|
||||||
|
More efficient Cython access is provided by Lexicon.get, which returns
|
||||||
|
a Lexeme*.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||||
|
string. If an int >= Lexicon.size, IndexError is raised.
|
||||||
|
If id_or_string is neither an int nor a unicode string, ValueError
|
||||||
|
is raised.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
lexeme (dict): A Lexeme struct instance, which Cython translates into
|
||||||
|
a dict if the operator is called from Python.
|
||||||
|
'''
|
||||||
if type(id_or_string) == int:
|
if type(id_or_string) == int:
|
||||||
|
if id_or_string >= self.lexemes.size():
|
||||||
|
raise IndexError
|
||||||
return self.lexemes.at(id_or_string)[0]
|
return self.lexemes.at(id_or_string)[0]
|
||||||
cdef String string
|
cdef UniStr string
|
||||||
string_from_unicode(&string, id_or_string)
|
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||||
cdef Lexeme* lexeme = self.get(&string)
|
cdef const Lexeme* lexeme = self.get(self.mem, &string)
|
||||||
return lexeme[0]
|
return lexeme[0]
|
||||||
|
|
||||||
def __setitem__(self, unicode uni_string, dict props):
|
def __setitem__(self, unicode uni_string, dict props):
|
||||||
cdef String s
|
cdef UniStr s
|
||||||
string_from_unicode(&s, uni_string)
|
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||||
cdef Lexeme* lex = self.get(&s)
|
# Cast through the const here, since we're allowed to change our own
|
||||||
|
# Lexemes.
|
||||||
|
lex = <Lexeme*><void*>self.get(self.mem, &s)
|
||||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||||
|
|
||||||
def dump(self, loc):
|
def dump(self, loc):
|
||||||
|
@ -287,11 +361,11 @@ cdef class Lexicon:
|
||||||
assert fp != NULL
|
assert fp != NULL
|
||||||
cdef size_t st
|
cdef size_t st
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
for i in range(self._dict.length):
|
for i in range(self._map.length):
|
||||||
key = self._dict.c_map.cells[i].key
|
key = self._map.c_map.cells[i].key
|
||||||
if key == 0:
|
if key == 0:
|
||||||
continue
|
continue
|
||||||
lexeme = <Lexeme*>self._dict.c_map.cells[i].value
|
lexeme = <Lexeme*>self._map.c_map.cells[i].value
|
||||||
st = fwrite(&key, sizeof(key), 1, fp)
|
st = fwrite(&key, sizeof(key), 1, fp)
|
||||||
assert st == 1
|
assert st == 1
|
||||||
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
|
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
|
||||||
|
@ -300,7 +374,8 @@ cdef class Lexicon:
|
||||||
assert st == 0
|
assert st == 0
|
||||||
|
|
||||||
def load(self, loc):
|
def load(self, loc):
|
||||||
assert path.exists(loc)
|
if not path.exists(loc):
|
||||||
|
raise IOError('Lexemes file not found at %s' % loc)
|
||||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||||
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
assert fp != NULL
|
assert fp != NULL
|
||||||
|
@ -316,21 +391,9 @@ cdef class Lexicon:
|
||||||
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
||||||
if st != 1:
|
if st != 1:
|
||||||
break
|
break
|
||||||
self._dict.set(key, lexeme)
|
self._map.set(key, lexeme)
|
||||||
while self.lexemes.size() < (lexeme.id + 1):
|
while self.lexemes.size() < (lexeme.id + 1):
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
self.lexemes[lexeme.id] = lexeme
|
self.lexemes[lexeme.id] = lexeme
|
||||||
i += 1
|
i += 1
|
||||||
self.size += 1
|
|
||||||
fclose(fp)
|
fclose(fp)
|
||||||
|
|
||||||
|
|
||||||
cdef void string_from_unicode(String* s, unicode uni):
|
|
||||||
cdef Py_UNICODE* c_uni = <Py_UNICODE*>uni
|
|
||||||
string_slice(s, c_uni, 0, len(uni))
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
|
|
||||||
s.chars = &chars[start]
|
|
||||||
s.n = end - start
|
|
||||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
|
||||||
|
|
90  spacy/lemmatizer.py  (new file)
@@ -0,0 +1,90 @@
from os import path


NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
)


VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
)


ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
)


class Lemmatizer(object):
    def __init__(self, wn_dict_dir):
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))

    def noun(self, string):
        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)

    def verb(self, string):
        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)

    def adj(self, string):
        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)


def read_index(loc):
    index = set()
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:
            index.add(word)
    return index


def read_exc(loc):
    exceptions = {}
    for line in open(loc):
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
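The lemmatizer above is purely list- and suffix-rule driven. A minimal sketch of how lemmatize behaves, using a hypothetical two-word index in place of the WordNet index files:

    index = {'apple', 'wolf'}          # toy stand-in for the WordNet noun index
    exceptions = {}
    assert lemmatize('apples', index, exceptions, NOUN_RULES) == set(['apple'])   # 's' -> ''
    assert lemmatize('wolves', index, exceptions, NOUN_RULES) == set(['wolf'])    # 'ves' -> 'f'
    assert lemmatize('zzz', index, exceptions, NOUN_RULES) == set(['zzz'])        # falls back to the input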
160  spacy/lexeme.pxd
|
@ -1,61 +1,137 @@
|
||||||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t
|
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
|
||||||
|
|
||||||
from .utf8string cimport StringStore
|
from .utf8string cimport StringStore
|
||||||
from libc.stdint cimport uint16_t
|
|
||||||
|
|
||||||
cpdef flag_t OOV_DIST_FLAGS
|
|
||||||
|
|
||||||
# Flags
|
# Reserve 64 values for flag features
|
||||||
cpdef enum:
|
cpdef enum attr_id_t:
|
||||||
IS_ALPHA
|
FLAG0
|
||||||
IS_ASCII
|
FLAG1
|
||||||
IS_DIGIT
|
FLAG2
|
||||||
IS_LOWER
|
FLAG3
|
||||||
IS_PUNCT
|
FLAG4
|
||||||
IS_SPACE
|
FLAG5
|
||||||
IS_TITLE
|
FLAG6
|
||||||
IS_UPPER
|
FLAG7
|
||||||
|
FLAG8
|
||||||
|
FLAG9
|
||||||
|
FLAG10
|
||||||
|
FLAG11
|
||||||
|
FLAG12
|
||||||
|
FLAG13
|
||||||
|
FLAG14
|
||||||
|
FLAG15
|
||||||
|
FLAG16
|
||||||
|
FLAG17
|
||||||
|
FLAG18
|
||||||
|
FLAG19
|
||||||
|
FLAG20
|
||||||
|
FLAG21
|
||||||
|
FLAG22
|
||||||
|
FLAG23
|
||||||
|
FLAG24
|
||||||
|
FLAG25
|
||||||
|
FLAG26
|
||||||
|
FLAG27
|
||||||
|
FLAG28
|
||||||
|
FLAG29
|
||||||
|
FLAG30
|
||||||
|
FLAG31
|
||||||
|
FLAG32
|
||||||
|
FLAG33
|
||||||
|
FLAG34
|
||||||
|
FLAG35
|
||||||
|
FLAG36
|
||||||
|
FLAG37
|
||||||
|
FLAG38
|
||||||
|
FLAG39
|
||||||
|
FLAG40
|
||||||
|
FLAG41
|
||||||
|
FLAG42
|
||||||
|
FLAG43
|
||||||
|
FLAG44
|
||||||
|
FLAG45
|
||||||
|
FLAG46
|
||||||
|
FLAG47
|
||||||
|
FLAG48
|
||||||
|
FLAG49
|
||||||
|
FLAG50
|
||||||
|
FLAG51
|
||||||
|
FLAG52
|
||||||
|
FLAG53
|
||||||
|
FLAG54
|
||||||
|
FLAG55
|
||||||
|
FLAG56
|
||||||
|
FLAG57
|
||||||
|
FLAG58
|
||||||
|
FLAG59
|
||||||
|
FLAG60
|
||||||
|
FLAG61
|
||||||
|
FLAG62
|
||||||
|
FLAG63
|
||||||
|
|
||||||
LIKE_URL
|
ID
|
||||||
LIKE_NUMBER
|
SIC
|
||||||
|
DENSE
|
||||||
|
SHAPE
|
||||||
|
PREFIX
|
||||||
|
SUFFIX
|
||||||
|
|
||||||
OFT_LOWER
|
LENGTH
|
||||||
OFT_TITLE
|
CLUSTER
|
||||||
OFT_UPPER
|
POS_TYPE
|
||||||
|
LEMMA
|
||||||
IN_MALES
|
|
||||||
IN_FEMALES
|
|
||||||
IN_SURNAMES
|
|
||||||
IN_PLACES
|
|
||||||
IN_GAMES
|
|
||||||
IN_CELEBS
|
|
||||||
IN_NAMES
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct Lexeme:
|
cdef struct Lexeme:
|
||||||
flag_t flags
|
flags_t flags
|
||||||
|
|
||||||
id_t id
|
attr_t id
|
||||||
id_t sic
|
attr_t sic
|
||||||
id_t norm
|
attr_t dense
|
||||||
id_t shape
|
attr_t shape
|
||||||
id_t asciied
|
attr_t prefix
|
||||||
id_t prefix
|
attr_t suffix
|
||||||
id_t suffix
|
|
||||||
|
attr_t length
|
||||||
|
attr_t cluster
|
||||||
|
attr_t pos_type
|
||||||
|
|
||||||
float prob
|
float prob
|
||||||
|
float sentiment
|
||||||
len_t length
|
|
||||||
tag_t cluster
|
|
||||||
tag_t postype
|
|
||||||
tag_t supersense
|
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme EMPTY_LEXEME
|
cdef Lexeme EMPTY_LEXEME
|
||||||
|
|
||||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
|
||||||
StringStore store, dict props) except *
|
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||||
|
dict props) except *
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint check_flag(Lexeme* lexeme, size_t flag_id) nogil:
|
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
|
||||||
return lexeme.flags & (1 << flag_id)
|
return lexeme.flags & (1 << flag_id)
|
||||||
|
|
||||||
|
|
||||||
|
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
|
||||||
|
if feat_name < (sizeof(flags_t) * 8):
|
||||||
|
return check_flag(lex, feat_name)
|
||||||
|
elif feat_name == ID:
|
||||||
|
return lex.id
|
||||||
|
elif feat_name == SIC:
|
||||||
|
return lex.sic
|
||||||
|
elif feat_name == DENSE:
|
||||||
|
return lex.dense
|
||||||
|
elif feat_name == SHAPE:
|
||||||
|
return lex.shape
|
||||||
|
elif feat_name == PREFIX:
|
||||||
|
return lex.prefix
|
||||||
|
elif feat_name == SUFFIX:
|
||||||
|
return lex.suffix
|
||||||
|
elif feat_name == LENGTH:
|
||||||
|
return lex.length
|
||||||
|
elif feat_name == CLUSTER:
|
||||||
|
return lex.cluster
|
||||||
|
elif feat_name == POS_TYPE:
|
||||||
|
return lex.pos_type
|
||||||
|
else:
|
||||||
|
return 0
|
||||||
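The first 64 attribute IDs are interpreted as bit positions in the 64-bit flags field, which is what check_flag and the first branch of get_attr rely on. A tiny illustration with hypothetical flag values:

    flags = (1 << 3) | (1 << 7)    # hypothetical lexeme with FLAG3 and FLAG7 set
    assert flags & (1 << 3)        # check_flag(lex, FLAG3) would be non-zero
    assert not flags & (1 << 5)    # FLAG5 is unset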
|
|
|
@ -6,67 +6,25 @@ from libc.string cimport memset
|
||||||
|
|
||||||
import orth
|
import orth
|
||||||
|
|
||||||
from .utf8string cimport Utf8Str
|
|
||||||
|
|
||||||
OOV_DIST_FLAGS = 0
|
|
||||||
|
|
||||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
||||||
|
|
||||||
|
|
||||||
def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
|
|
||||||
cdef flag_t flags = 0
|
|
||||||
flags |= orth.is_alpha(string) << IS_ALPHA
|
|
||||||
flags |= orth.is_ascii(string) << IS_ASCII
|
|
||||||
flags |= orth.is_digit(string) << IS_DIGIT
|
|
||||||
flags |= orth.is_lower(string) << IS_LOWER
|
|
||||||
flags |= orth.is_punct(string) << IS_PUNCT
|
|
||||||
flags |= orth.is_space(string) << IS_SPACE
|
|
||||||
flags |= orth.is_title(string) << IS_TITLE
|
|
||||||
flags |= orth.is_upper(string) << IS_UPPER
|
|
||||||
|
|
||||||
flags |= orth.like_url(string) << LIKE_URL
|
|
||||||
flags |= orth.like_number(string) << LIKE_NUMBER
|
|
||||||
return flags
|
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||||
StringStore store, dict props) except *:
|
StringStore string_store, dict props) except *:
|
||||||
cdef Lexeme lex
|
cdef Lexeme lex
|
||||||
lex.id = i
|
lex.id = i
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.sic = get_string_id(string, store)
|
lex.sic = string_store[string]
|
||||||
|
|
||||||
lex.cluster = props.get('cluster', 0)
|
lex.cluster = props.get('cluster', 0)
|
||||||
lex.postype = props.get('postype', 0)
|
lex.pos_type = props.get('pos_type', 0)
|
||||||
lex.supersense = props.get('supersense', 0)
|
|
||||||
lex.prob = props.get('prob', 0)
|
lex.prob = props.get('prob', 0)
|
||||||
|
|
||||||
cdef float upper_pc = props.get('upper_pc', 0.0)
|
lex.prefix = string_store[string[:1]]
|
||||||
cdef float lower_pc = props.get('lower_pc', 0.0)
|
lex.suffix = string_store[string[-3:]]
|
||||||
cdef float title_pc = props.get('title_pc', 0.0)
|
lex.shape = string_store[orth.word_shape(string)]
|
||||||
|
lex.dense = string_store[props['dense']]
|
||||||
|
|
||||||
lex.prefix = get_string_id(string[0], store)
|
lex.flags = props.get('flags', 0)
|
||||||
lex.suffix = get_string_id(string[-3:], store)
|
|
||||||
if upper_pc or lower_pc or title_pc:
|
|
||||||
canon_cased = orth.canon_case(string, upper_pc, title_pc, lower_pc)
|
|
||||||
lex.norm = get_string_id(canon_cased, store)
|
|
||||||
else:
|
|
||||||
lex.norm = lex.sic
|
|
||||||
lex.shape = get_string_id(orth.word_shape(string), store)
|
|
||||||
lex.asciied = get_string_id(orth.asciied(string), store)
|
|
||||||
lex.flags = get_flags(string, upper_pc, title_pc, lower_pc)
|
|
||||||
|
|
||||||
lex.flags |= props.get('in_males', 0) << IN_MALES
|
|
||||||
lex.flags |= props.get('in_females', 0) << IN_FEMALES
|
|
||||||
lex.flags |= props.get('in_surnames', 0) << IN_SURNAMES
|
|
||||||
lex.flags |= props.get('in_places', 0) << IN_PLACES
|
|
||||||
lex.flags |= props.get('in_celebs', 0) << IN_CELEBS
|
|
||||||
lex.flags |= props.get('in_games', 0) << IN_GAMES
|
|
||||||
lex.flags |= props.get('in_names', 0) << IN_NAMES
|
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
|
|
||||||
cdef id_t get_string_id(unicode string, StringStore store) except 0:
|
|
||||||
cdef bytes byte_string = string.encode('utf8')
|
|
||||||
cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string))
|
|
||||||
return orig_str.i
|
|
||||||
|
|
45  spacy/morphology.pxd  (new file)
@@ -0,0 +1,45 @@
from .tokens cimport TokenC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
from .typedefs cimport id_t, Morphology

from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool


# Google universal tag set
cpdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS


cdef struct PosTag:
    Morphology morph
    int id
    univ_tag_t pos


cdef class Morphologizer:
    cdef Pool mem
    cdef StringStore strings
    cdef object lemmatizer
    cdef PosTag* tags
    cdef readonly list tag_names

    cdef PreshMapArray _cache
    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
    cdef int set_morph(self, const int i, TokenC* tokens) except -1
117  spacy/morphology.pyx  (new file)
@@ -0,0 +1,117 @@
# cython: profile=True
# cython: embedsignature=True
from os import path
import json

from .lemmatizer import Lemmatizer
from .typedefs cimport id_t

UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL
}


cdef struct _Cached:
    Morphology morph
    int lemma


cdef class Morphologizer:
    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
    """
    def __init__(self, StringStore strings, data_dir):
        self.mem = Pool()
        self.strings = strings
        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
        tag_map = cfg['tag_map']
        self.tag_names = cfg['tag_names']
        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
        self._cache = PreshMapArray(len(self.tag_names))
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
        for i, tag in enumerate(self.tag_names):
            pos, props = tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            self.tags[i].morph.number = props.get('number', 0)
            self.tags[i].morph.tenspect = props.get('tenspect', 0)
            self.tags[i].morph.mood = props.get('mood', 0)
            self.tags[i].morph.gender = props.get('gender', 0)
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
        if path.exists(path.join(data_dir, 'morphs.json')):
            with open(path.join(data_dir, 'morphs.json')) as file_:
                self.load_exceptions(json.load(file_))

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
            return lex.sic
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.sic
        cdef bytes py_string = self.strings[lex.sic]
        cdef set lemma_strings
        cdef bytes lemma_string
        if pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            assert pos == ADJ
            lemma_strings = self.lemmatizer.adj(py_string)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
        return lemma

    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
        cdef const PosTag* tag = &self.tags[tokens[i].pos]
        cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
        if cached is NULL:
            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)

        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

    def load_exceptions(self, dict exc):
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
        cdef dict entries
        cdef dict props
        cdef int lemma
        cdef id_t sic
        cdef univ_tag_t pos
        for pos_str, entries in exc.items():
            pos = self.tag_names.index(pos_str)
            for form_str, props in entries.items():
                lemma_str = props.get('L', form_str)
                sic = self.strings[form_str]
                cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
                cached.lemma = self.strings[lemma_str]
                set_morph_from_dict(&cached.morph, props)
                self._cache.set(pos, sic, <void*>cached)


cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    morph.number = props.get('number', 0)
    morph.tenspect = props.get('tenspect', 0)
    morph.mood = props.get('mood', 0)
    morph.gender = props.get('gender', 0)
    morph.person = props.get('person', 0)
    morph.case = props.get('case', 0)
    morph.misc = props.get('misc', 0)
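A minimal sketch of the exception data load_exceptions consumes: the file name morphs.json and the 'L' key come from the code above, while the concrete tag name and entry below are hypothetical, for illustration only.

    # hypothetical morphs.json content: tag name -> surface form -> properties;
    # 'L' overrides the lemma, the remaining keys feed set_morph_from_dict.
    morph_exceptions = {
        "VBZ": {
            "is": {"L": "be", "number": 1, "person": 3},
        },
    }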
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
|
import re
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
|
|
|
@ -147,6 +147,7 @@ Y PRT
|
||||||
Z NOUN
|
Z NOUN
|
||||||
^ NOUN
|
^ NOUN
|
||||||
~ X
|
~ X
|
||||||
`` .""".strip().split('\n'))
|
`` .
|
||||||
|
EOL EOL""".strip().split('\n'))
|
||||||
return mapping[tag]
|
return mapping[tag]
|
||||||
|
|
||||||
|
|
|
@ -1,34 +1,23 @@
|
||||||
|
from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from thinc.learner cimport LinearModel
|
from thinc.learner cimport LinearModel
|
||||||
from thinc.features cimport Extractor
|
from thinc.features cimport Extractor
|
||||||
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from preshed.maps cimport PreshMapArray
|
||||||
from .context cimport Slots
|
|
||||||
|
from .typedefs cimport hash_t, id_t
|
||||||
from .tokens cimport Tokens
|
from .tokens cimport Tokens
|
||||||
|
|
||||||
|
|
||||||
cpdef enum TagType:
|
|
||||||
POS
|
|
||||||
ENTITY
|
|
||||||
SENSE
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
cdef class Tagger:
|
||||||
cpdef int set_tags(self, Tokens tokens) except -1
|
cdef class_t predict(self, const atom_t* context, object golds=*) except *
|
||||||
cpdef class_t predict(self, int i, Tokens tokens) except 0
|
|
||||||
cpdef int tell_answer(self, list gold) except -1
|
|
||||||
|
|
||||||
cpdef readonly Pool mem
|
cpdef readonly Pool mem
|
||||||
cpdef readonly Extractor extractor
|
cpdef readonly Extractor extractor
|
||||||
cpdef readonly LinearModel model
|
cpdef readonly LinearModel model
|
||||||
|
|
||||||
cpdef readonly TagType tag_type
|
|
||||||
cpdef readonly list tag_names
|
cpdef readonly list tag_names
|
||||||
|
cdef dict tagdict
|
||||||
cdef class_t _guess
|
|
||||||
cdef atom_t* _context
|
|
||||||
cdef feat_t* _feats
|
|
||||||
cdef weight_t* _values
|
|
||||||
cdef weight_t* _scores
|
|
||||||
|
|
183  spacy/tagger.pyx
|
@ -1,5 +1,4 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
from __future__ import print_function
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
|
|
||||||
|
@ -10,155 +9,59 @@ import random
|
||||||
import json
|
import json
|
||||||
import cython
|
import cython
|
||||||
|
|
||||||
|
from thinc.features cimport Feature, count_feats
|
||||||
from .context cimport fill_context
|
|
||||||
from .context cimport N_FIELDS
|
|
||||||
|
|
||||||
from thinc.features cimport ConjFeat
|
|
||||||
|
|
||||||
|
|
||||||
NULL_TAG = 0
|
def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
|
||||||
|
|
||||||
|
|
||||||
def setup_model_dir(tag_type, tag_names, templates, model_dir):
|
|
||||||
if path.exists(model_dir):
|
if path.exists(model_dir):
|
||||||
shutil.rmtree(model_dir)
|
shutil.rmtree(model_dir)
|
||||||
os.mkdir(model_dir)
|
os.mkdir(model_dir)
|
||||||
config = {
|
config = {
|
||||||
'tag_type': tag_type,
|
|
||||||
'templates': templates,
|
'templates': templates,
|
||||||
'tag_names': tag_names,
|
'tag_names': tag_names,
|
||||||
|
'tag_map': tag_map,
|
||||||
|
'tag_counts': tag_counts,
|
||||||
}
|
}
|
||||||
with open(path.join(model_dir, 'config.json'), 'w') as file_:
|
with open(path.join(model_dir, 'config.json'), 'w') as file_:
|
||||||
json.dump(config, file_)
|
json.dump(config, file_)
|
||||||
|
|
||||||
|
|
||||||
def train(train_sents, model_dir, nr_iter=10):
|
|
||||||
cdef Tokens tokens
|
|
||||||
tagger = Tagger(model_dir)
|
|
||||||
for _ in range(nr_iter):
|
|
||||||
n_corr = 0
|
|
||||||
total = 0
|
|
||||||
for tokens, golds in train_sents:
|
|
||||||
assert len(tokens) == len(golds), [t.string for t in tokens]
|
|
||||||
for i in range(tokens.length):
|
|
||||||
if tagger.tag_type == POS:
|
|
||||||
gold = _get_gold_pos(i, golds, tokens.pos)
|
|
||||||
elif tagger.tag_type == ENTITY:
|
|
||||||
gold = _get_gold_ner(i, golds, tokens.ner)
|
|
||||||
guess = tagger.predict(i, tokens)
|
|
||||||
tokens.set_tag(i, tagger.tag_type, guess)
|
|
||||||
if gold is not None:
|
|
||||||
tagger.tell_answer(gold)
|
|
||||||
total += 1
|
|
||||||
n_corr += guess in gold
|
|
||||||
#print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
|
|
||||||
print('%.4f' % ((n_corr / total) * 100))
|
|
||||||
random.shuffle(train_sents)
|
|
||||||
tagger.model.end_training()
|
|
||||||
tagger.model.dump(path.join(model_dir, 'model'))
|
|
||||||
|
|
||||||
|
|
||||||
cdef object _get_gold_pos(i, golds, int* pred):
|
|
||||||
if golds[i] == 0:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return [golds[i]]
|
|
||||||
|
|
||||||
|
|
||||||
cdef object _get_gold_ner(i, golds, int* ner):
|
|
||||||
if golds[i] == 0:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
return [golds[i]]
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(tagger, sents):
|
|
||||||
n_corr = 0
|
|
||||||
total = 0
|
|
||||||
for tokens, golds in sents:
|
|
||||||
for i, gold in enumerate(golds):
|
|
||||||
guess = tagger.predict(i, tokens)
|
|
||||||
tokens.set_tag(i, tagger.tag_type, guess)
|
|
||||||
if gold != NULL_TAG:
|
|
||||||
total += 1
|
|
||||||
n_corr += guess == gold
|
|
||||||
return n_corr / total
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
cdef class Tagger:
|
||||||
"""Assign part-of-speech, named entity or supersense tags, using greedy
|
"""Predict some type of tag, using greedy decoding. The tagger reads its
|
||||||
decoding. The tagger reads its model and configuration from disk.
|
model and configuration from disk.
|
||||||
"""
|
"""
|
||||||
def __init__(self, model_dir):
|
def __init__(self, model_dir):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
cfg = json.load(open(path.join(model_dir, 'config.json')))
|
cfg = json.load(open(path.join(model_dir, 'config.json')))
|
||||||
templates = cfg['templates']
|
templates = cfg['templates']
|
||||||
|
univ_counts = {}
|
||||||
|
cdef unicode tag
|
||||||
|
cdef unicode univ_tag
|
||||||
self.tag_names = cfg['tag_names']
|
self.tag_names = cfg['tag_names']
|
||||||
self.tag_type = cfg['tag_type']
|
self.tagdict = _make_tag_dict(cfg['tag_counts'])
|
||||||
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
|
self.extractor = Extractor(templates)
|
||||||
self.model = LinearModel(len(self.tag_names))
|
self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
|
||||||
if path.exists(path.join(model_dir, 'model')):
|
if path.exists(path.join(model_dir, 'model')):
|
||||||
self.model.load(path.join(model_dir, 'model'))
|
self.model.load(path.join(model_dir, 'model'))
|
||||||
|
|
||||||
self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
|
cdef class_t predict(self, atom_t* context, object golds=None) except *:
|
||||||
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
|
"""Predict the tag of tokens[i].
|
||||||
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
|
|
||||||
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
|
|
||||||
self._guess = NULL_TAG
|
|
||||||
|
|
||||||
cpdef int set_tags(self, Tokens tokens) except -1:
|
|
||||||
"""Assign tags to a Tokens object.
|
|
||||||
|
|
||||||
>>> tokens = EN.tokenize(u'An example sentence.')
|
|
||||||
>>> assert tokens[0].pos == 'NO_TAG'
|
|
||||||
>>> EN.pos_tagger.set_tags(tokens)
|
|
||||||
>>> assert tokens[0].pos == 'DT'
|
|
||||||
"""
|
|
||||||
cdef int i
|
|
||||||
for i in range(tokens.length):
|
|
||||||
tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
|
|
||||||
|
|
||||||
cpdef class_t predict(self, int i, Tokens tokens) except 0:
|
|
||||||
"""Predict the tag of tokens[i]. The tagger remembers the features and
|
|
||||||
prediction, in case you later call tell_answer.
|
|
||||||
|
|
||||||
>>> tokens = EN.tokenize(u'An example sentence.')
|
>>> tokens = EN.tokenize(u'An example sentence.')
|
||||||
>>> tag = EN.pos_tagger.predict(0, tokens)
|
>>> tag = EN.pos_tagger.predict(0, tokens)
|
||||||
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
|
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
|
||||||
"""
|
"""
|
||||||
fill_context(self._context, i, tokens)
|
cdef int n_feats
|
||||||
self.extractor.extract(self._feats, self._values, self._context, NULL)
|
cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
|
||||||
self._guess = self.model.score(self._scores, self._feats, self._values)
|
cdef weight_t* scores = self.model.get_scores(feats, n_feats)
|
||||||
return self._guess
|
guess = _arg_max(scores, self.model.nr_class)
|
||||||
|
if golds is not None and guess not in golds:
|
||||||
cpdef int tell_answer(self, list golds) except -1:
|
best = _arg_max_among(scores, golds)
|
||||||
"""Provide the correct tag for the word the tagger was last asked to predict.
|
counts = {guess: {}, best: {}}
|
||||||
During Tagger.predict, the tagger remembers the features and prediction
|
count_feats(counts[guess], feats, n_feats, -1)
|
||||||
for the example. These are used to calculate a weight update given the
|
count_feats(counts[best], feats, n_feats, 1)
|
||||||
correct label.
|
|
||||||
|
|
||||||
>>> tokens = EN.tokenize('An example sentence.')
|
|
||||||
>>> guess = EN.pos_tagger.predict(1, tokens)
|
|
||||||
>>> JJ = EN.pos_tagger.tag_id('JJ')
|
|
||||||
>>> JJ
|
|
||||||
7
|
|
||||||
>>> EN.pos_tagger.tell_answer(JJ)
|
|
||||||
"""
|
|
||||||
cdef class_t guess = self._guess
|
|
||||||
if guess in golds:
|
|
||||||
self.model.update({})
|
|
||||||
return 0
|
|
||||||
best_gold = golds[0]
|
|
||||||
best_score = self._scores[best_gold-1]
|
|
||||||
for gold in golds[1:]:
|
|
||||||
if self._scores[gold-1] > best_gold:
|
|
||||||
best_score = self._scores[best_gold-1]
|
|
||||||
best_gold = gold
|
|
||||||
counts = {guess: {}, best_gold: {}}
|
|
||||||
self.extractor.count(counts[best_gold], self._feats, 1)
|
|
||||||
self.extractor.count(counts[guess], self._feats, -1)
|
|
||||||
self.model.update(counts)
|
self.model.update(counts)
|
||||||
|
return guess
|
||||||
|
|
||||||
def tag_id(self, object tag_name):
|
def tag_id(self, object tag_name):
|
||||||
"""Encode tag_name into a tag ID integer."""
|
"""Encode tag_name into a tag ID integer."""
|
||||||
|
@ -167,3 +70,41 @@ cdef class Tagger:
|
||||||
tag_id = len(self.tag_names)
|
tag_id = len(self.tag_names)
|
||||||
self.tag_names.append(tag_name)
|
self.tag_names.append(tag_name)
|
||||||
return tag_id
|
return tag_id
|
||||||
|
|
||||||
|
|
||||||
|
def _make_tag_dict(counts):
|
||||||
|
freq_thresh = 20
|
||||||
|
ambiguity_thresh = 0.97
|
||||||
|
tagdict = {}
|
||||||
|
cdef atom_t word
|
||||||
|
cdef atom_t tag
|
||||||
|
for word_str, tag_freqs in counts.items():
|
||||||
|
tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1])
|
||||||
|
n = sum(tag_freqs.values())
|
||||||
|
word = int(word_str)
|
||||||
|
tag = int(tag_str)
|
||||||
|
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
|
||||||
|
tagdict[word] = tag
|
||||||
|
return tagdict
|
||||||
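A worked sketch of the thresholds in _make_tag_dict above: a word only enters the tag dictionary if it was seen at least 20 times and its most frequent tag accounts for at least 97% of those occurrences. The counts below are hypothetical word-ID/tag-ID frequencies, not real model data.

    counts = {'1012': {'13': 998, '7': 2},    # frequent and nearly unambiguous: kept
              '2044': {'13': 6, '7': 5}}      # too rare and too ambiguous: dropped
    tagdict = {}
    for word_str, tag_freqs in counts.items():
        tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1])
        n = sum(tag_freqs.values())
        if n >= 20 and float(mode) / n >= 0.97:
            tagdict[int(word_str)] = int(tag_str)
    assert tagdict == {1012: 13}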
|
|
||||||
|
|
||||||
|
cdef class_t _arg_max(weight_t* scores, int n_classes) except 9000:
|
||||||
|
cdef int best = 0
|
||||||
|
cdef weight_t score = scores[best]
|
||||||
|
cdef int i
|
||||||
|
for i in range(1, n_classes):
|
||||||
|
if scores[i] >= score:
|
||||||
|
score = scores[i]
|
||||||
|
best = i
|
||||||
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
cdef class_t _arg_max_among(weight_t* scores, list classes):
|
||||||
|
cdef int best = classes[0]
|
||||||
|
cdef weight_t score = scores[best]
|
||||||
|
cdef class_t clas
|
||||||
|
for clas in classes:
|
||||||
|
if scores[clas] > score:
|
||||||
|
score = scores[clas]
|
||||||
|
best = clas
|
||||||
|
return best
|
||||||
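A minimal pure-Python sketch of the greedy decision the new predict method makes: take the highest-scoring class overall, and, when gold classes are supplied, also find the best-scoring gold class to drive the perceptron update. The score values are made up.

    def arg_max(scores):
        return max(range(len(scores)), key=lambda i: scores[i])

    def arg_max_among(scores, classes):
        return max(classes, key=lambda c: scores[c])

    scores = [0.1, 2.5, -0.3, 1.9]
    assert arg_max(scores) == 1
    assert arg_max_among(scores, [2, 3]) == 3   # best among the allowed (gold) classes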
|
|
|
@ -1,40 +1,55 @@
|
||||||
|
import numpy as np
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from thinc.typedefs cimport atom_t
|
||||||
|
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .typedefs cimport flag_t
|
|
||||||
from .utf8string cimport StringStore
|
|
||||||
from .tagger cimport TagType
|
|
||||||
|
|
||||||
from thinc.typedefs cimport atom_t
|
from .typedefs cimport flags_t
|
||||||
|
from .typedefs cimport Morphology
|
||||||
|
from .lang cimport Language
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct TokenC:
|
||||||
|
const Lexeme* lex
|
||||||
|
Morphology morph
|
||||||
|
int idx
|
||||||
|
int pos
|
||||||
|
int lemma
|
||||||
|
int sense
|
||||||
|
|
||||||
|
|
||||||
|
ctypedef const Lexeme* const_Lexeme_ptr
|
||||||
|
ctypedef TokenC* TokenC_ptr
|
||||||
|
|
||||||
|
ctypedef fused LexemeOrToken:
|
||||||
|
const_Lexeme_ptr
|
||||||
|
TokenC_ptr
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef StringStore _string_store
|
cdef Language lang
|
||||||
|
cdef list tag_names
|
||||||
|
|
||||||
cdef Lexeme** _lex_ptr
|
cdef TokenC* data
|
||||||
cdef int* _idx_ptr
|
|
||||||
cdef int* _pos_ptr
|
|
||||||
cdef int* _ner_ptr
|
|
||||||
cdef Lexeme** lex
|
|
||||||
cdef int* idx
|
|
||||||
cdef int* pos
|
|
||||||
cdef int* ner
|
|
||||||
|
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
|
|
||||||
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
|
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
||||||
cdef int push_back(self, int i, Lexeme* lexeme) except -1
|
|
||||||
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
|
cpdef np.ndarray[long, ndim=2] get_array(self, list features)
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
cdef StringStore _string_store
|
cdef public Language lang
|
||||||
cdef public int i
|
cdef public int i
|
||||||
cdef public int idx
|
cdef public int idx
|
||||||
cdef public int pos
|
cdef int pos
|
||||||
cdef public int ner
|
cdef int lemma
|
||||||
|
|
||||||
cdef public atom_t id
|
cdef public atom_t id
|
||||||
cdef public atom_t cluster
|
cdef public atom_t cluster
|
||||||
|
@ -51,4 +66,4 @@ cdef class Token:
|
||||||
|
|
||||||
cdef public float prob
|
cdef public float prob
|
||||||
|
|
||||||
cdef public flag_t flags
|
cdef public flags_t flags
|
||||||
|
|
151  spacy/tokens.pyx
|
@ -1,7 +1,15 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
from preshed.counter cimport PreshCounter
|
||||||
|
|
||||||
from .lexeme cimport *
|
from .lexeme cimport *
|
||||||
cimport cython
|
cimport cython
|
||||||
from .tagger cimport POS, ENTITY
|
|
||||||
|
import numpy as np
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
|
POS = 0
|
||||||
|
ENTITY = 0
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
@ -17,23 +25,13 @@ cdef class Tokens:
|
||||||
"""A sequence of references to Lexeme objects.
|
"""A sequence of references to Lexeme objects.
|
||||||
|
|
||||||
The Tokens class provides fast and memory-efficient access to lexical features,
|
The Tokens class provides fast and memory-efficient access to lexical features,
|
||||||
and can efficiently export the data to a numpy array. Specific languages
|
and can efficiently export the data to a numpy array.
|
||||||
create their own Tokens subclasses, to provide more convenient access to
|
|
||||||
language-specific features.
|
|
||||||
|
|
||||||
>>> from spacy.en import EN
|
>>> from spacy.en import EN
|
||||||
>>> tokens = EN.tokenize('An example sentence.')
|
>>> tokens = EN.tokenize('An example sentence.')
|
||||||
>>> tokens.string(0)
|
|
||||||
'An'
|
|
||||||
>>> tokens.prob(0) > tokens.prob(1)
|
|
||||||
True
|
|
||||||
>>> tokens.can_noun(0)
|
|
||||||
False
|
|
||||||
>>> tokens.can_noun(1)
|
|
||||||
True
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, StringStore string_store, string_length=0):
|
def __init__(self, Language lang, string_length=0):
|
||||||
self._string_store = string_store
|
self.lang = lang
|
||||||
if string_length >= 3:
|
if string_length >= 3:
|
||||||
size = int(string_length / 3.0)
|
size = int(string_length / 3.0)
|
||||||
else:
|
else:
|
||||||
|
@ -42,28 +40,18 @@ cdef class Tokens:
|
||||||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
||||||
# However, we need to remember the true starting places, so that we can
|
# However, we need to remember the true starting places, so that we can
|
||||||
# realloc.
|
# realloc.
|
||||||
self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
|
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||||
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self.lex = self._lex_ptr
|
|
||||||
self.idx = self._idx_ptr
|
|
||||||
self.pos = self._pos_ptr
|
|
||||||
self.ner = self._ner_ptr
|
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(size + (PADDING*2)):
|
for i in range(size + (PADDING*2)):
|
||||||
self.lex[i] = &EMPTY_LEXEME
|
data_start[i].lex = &EMPTY_LEXEME
|
||||||
self.lex += PADDING
|
self.data = data_start + PADDING
|
||||||
self.idx += PADDING
|
|
||||||
self.pos += PADDING
|
|
||||||
self.ner += PADDING
|
|
||||||
self.max_length = size
|
self.max_length = size
|
||||||
self.length = 0
|
self.length = 0
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
bounds_check(i, self.length, PADDING)
|
bounds_check(i, self.length, PADDING)
|
||||||
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
|
return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
|
||||||
self.lex[i][0])
|
self.data[i].lemma, self.data[i].lex[0])
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
|
@ -72,70 +60,78 @@ cdef class Tokens:
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
cdef int push_back(self, int idx, Lexeme* lexeme) except -1:
|
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
self.lex[self.length] = lexeme
|
cdef TokenC* t = &self.data[self.length]
|
||||||
self.idx[self.length] = idx
|
if LexemeOrToken is TokenC_ptr:
|
||||||
self.pos[self.length] = 0
|
t[0] = lex_or_tok[0]
|
||||||
self.ner[self.length] = 0
|
|
||||||
self.length += 1
|
|
||||||
return idx + lexeme.length
|
|
||||||
|
|
||||||
cdef int extend(self, int idx, Lexeme** lexemes, int n) except -1:
|
|
||||||
cdef int i
|
|
||||||
if lexemes == NULL:
|
|
||||||
return idx
|
|
||||||
elif n == 0:
|
|
||||||
i = 0
|
|
||||||
while lexemes[i] != NULL:
|
|
||||||
idx = self.push_back(idx, lexemes[i])
|
|
||||||
i += 1
|
|
||||||
else:
|
else:
|
||||||
for i in range(n):
|
t.lex = lex_or_tok
|
||||||
idx = self.push_back(idx, lexemes[i])
|
self.length += 1
|
||||||
return idx
|
return idx + t.lex.length
|
||||||
|
|
||||||
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
|
@cython.boundscheck(False)
|
||||||
if tag_type == POS:
|
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
|
||||||
self.pos[i] = tag
|
cdef int i, j
|
||||||
elif tag_type == ENTITY:
|
cdef attr_id_t feature
|
||||||
self.ner[i] = tag
|
cdef np.ndarray[long, ndim=2] output
|
||||||
|
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
||||||
|
for i in range(self.length):
|
||||||
|
for j, feature in enumerate(attr_ids):
|
||||||
|
output[i, j] = get_attr(self.data[i].lex, feature)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def count_by(self, attr_id_t attr_id):
|
||||||
|
cdef int i
|
||||||
|
cdef attr_t attr
|
||||||
|
cdef size_t count
|
||||||
|
|
||||||
|
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
||||||
|
for i in range(self.length):
|
||||||
|
if attr_id == LEMMA:
|
||||||
|
attr = self.data[i].lemma
|
||||||
|
else:
|
||||||
|
attr = get_attr(self.data[i].lex, attr_id)
|
||||||
|
counts.inc(attr, 1)
|
||||||
|
return dict(counts)
|
||||||
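A usage sketch of the two export methods above. It assumes the English pipeline object EN used in this diff's doctests, and that the cpdef attribute IDs (SIC, CLUSTER, LEMMA) are importable from the compiled spacy.lexeme module; treat the import paths as assumptions rather than confirmed API.

    from spacy.en import EN
    from spacy.lexeme import SIC, CLUSTER, LEMMA   # assumed import path for the attr IDs

    tokens = EN.tokenize(u'An example sentence.')
    arr = tokens.get_array([SIC, CLUSTER])   # numpy array of shape (len(tokens), 2)
    freqs = tokens.count_by(LEMMA)           # dict mapping lemma string-IDs to counts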
|
|
||||||
def _realloc(self, new_size):
|
def _realloc(self, new_size):
|
||||||
self.max_length = new_size
|
self.max_length = new_size
|
||||||
n = new_size + (PADDING * 2)
|
n = new_size + (PADDING * 2)
|
||||||
self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
|
# What we're storing is a "padded" array. We've jumped forward PADDING
|
||||||
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
|
# places, and are storing the pointer to that. This way, we can access
|
||||||
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
|
# words out-of-bounds, and get out-of-bounds markers.
|
||||||
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
|
# Now that we want to realloc, we need the address of the true start,
|
||||||
self.lex = self._lex_ptr + PADDING
|
# so we jump the pointer back PADDING places.
|
||||||
self.idx = self._idx_ptr + PADDING
|
cdef TokenC* data_start = self.data - PADDING
|
||||||
self.pos = self._pos_ptr + PADDING
|
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
|
||||||
self.ner = self._ner_ptr + PADDING
|
self.data = data_start + PADDING
|
||||||
|
cdef int i
|
||||||
for i in range(self.length, self.max_length + PADDING):
|
for i in range(self.length, self.max_length + PADDING):
|
||||||
self.lex[i] = &EMPTY_LEXEME
|
self.data[i].lex = &EMPTY_LEXEME
|
||||||
|
|
||||||
|
|
||||||
@cython.freelist(64)
|
@cython.freelist(64)
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
|
def __init__(self, Language lang, int i, int idx,
|
||||||
dict lex):
|
int pos, int lemma, dict lex):
|
||||||
self._string_store = string_store
|
self.lang = lang
|
||||||
self.idx = idx
|
self.idx = idx
|
||||||
self.pos = pos
|
self.pos = pos
|
||||||
self.ner = ner
|
|
||||||
self.i = i
|
self.i = i
|
||||||
self.id = lex['id']
|
self.id = lex['id']
|
||||||
|
|
||||||
|
self.lemma = lemma
|
||||||
|
|
||||||
self.cluster = lex['cluster']
|
self.cluster = lex['cluster']
|
||||||
self.length = lex['length']
|
self.length = lex['length']
|
||||||
self.postype = lex['postype']
|
self.postype = lex['pos_type']
|
||||||
self.sensetype = lex['supersense']
|
self.sensetype = 0
|
||||||
self.sic = lex['sic']
|
self.sic = lex['sic']
|
||||||
self.norm = lex['norm']
|
self.norm = lex['dense']
|
||||||
self.shape = lex['shape']
|
self.shape = lex['shape']
|
||||||
self.suffix = lex['asciied']
|
self.suffix = lex['suffix']
|
||||||
self.prefix = lex['prefix']
|
self.prefix = lex['prefix']
|
||||||
|
|
||||||
self.prob = lex['prob']
|
self.prob = lex['prob']
|
||||||
|
@ -145,5 +141,16 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self.sic == 0:
|
if self.sic == 0:
|
||||||
return ''
|
return ''
|
||||||
cdef bytes utf8string = self._string_store[self.sic]
|
cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
|
||||||
return utf8string.decode('utf8')
|
return utf8string.decode('utf8')
|
||||||
|
|
||||||
|
property lemma:
|
||||||
|
def __get__(self):
|
||||||
|
if self.lemma == 0:
|
||||||
|
return self.string
|
||||||
|
cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
|
||||||
|
return utf8string.decode('utf8')
|
||||||
|
|
||||||
|
property pos:
|
||||||
|
def __get__(self):
|
||||||
|
return self.lang.pos_tagger.tag_names[self.pos]
|
||||||
|
|
|
@ -1,8 +1,20 @@
|
||||||
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
||||||
|
from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
ctypedef uint64_t hash_t
|
ctypedef uint64_t hash_t
|
||||||
ctypedef char* utf8_t
|
ctypedef char* utf8_t
|
||||||
ctypedef uint64_t flag_t
|
ctypedef uint32_t attr_t
|
||||||
|
ctypedef uint64_t flags_t
|
||||||
ctypedef uint32_t id_t
|
ctypedef uint32_t id_t
|
||||||
ctypedef uint16_t len_t
|
ctypedef uint16_t len_t
|
||||||
ctypedef uint16_t tag_t
|
ctypedef uint16_t tag_t
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct Morphology:
|
||||||
|
uint8_t number
|
||||||
|
uint8_t tenspect # Tense/aspect/voice
|
||||||
|
uint8_t mood
|
||||||
|
uint8_t gender
|
||||||
|
uint8_t person
|
||||||
|
uint8_t case
|
||||||
|
uint8_t misc
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .typedefs cimport utf8_t, id_t, hash_t
|
from .typedefs cimport utf8_t, id_t, hash_t
|
||||||
|
|
||||||
|
@ -11,11 +12,23 @@ cdef struct Utf8Str:
|
||||||
int length
|
int length
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct UniStr:
|
||||||
|
Py_UNICODE* chars
|
||||||
|
size_t n
|
||||||
|
hash_t key
|
||||||
|
|
||||||
|
|
||||||
|
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||||
|
s.chars = &chars[start]
|
||||||
|
s.n = end - start
|
||||||
|
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef PreshMap table
|
cdef PreshMap _map
|
||||||
cdef Utf8Str* strings
|
cdef Utf8Str* strings
|
||||||
cdef int size
|
cdef int size
|
||||||
cdef int _resize_at
|
cdef int _resize_at
|
||||||
|
|
||||||
cdef Utf8Str* intern(self, char* chars, int length) except NULL
|
cdef const Utf8Str* intern(self, char* chars, int length) except NULL
|
||||||
|
|
|
@ -5,10 +5,11 @@ import codecs
|
||||||
|
|
||||||
SEPARATOR = '\n|-SEP-|\n'
|
SEPARATOR = '\n|-SEP-|\n'
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.table = PreshMap()
|
self._map = PreshMap()
|
||||||
self._resize_at = 10000
|
self._resize_at = 10000
|
||||||
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||||
self.size = 1
|
self.size = 1
|
||||||
|
@ -17,26 +18,30 @@ cdef class StringStore:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.size-1
|
return self.size-1
|
||||||
|
|
||||||
def __getitem__(self, string_or_id):
|
def __getitem__(self, object string_or_id):
|
||||||
cdef bytes byte_string
|
cdef bytes byte_string
|
||||||
cdef Utf8Str* utf8str
|
cdef const Utf8Str* utf8str
|
||||||
if type(string_or_id) == int or type(string_or_id) == long:
|
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
|
||||||
if string_or_id < 1 or string_or_id >= self.size:
|
if string_or_id < 1 or string_or_id >= self.size:
|
||||||
raise IndexError(string_or_id)
|
raise IndexError(string_or_id)
|
||||||
utf8str = &self.strings[<int>string_or_id]
|
utf8str = &self.strings[<int>string_or_id]
|
||||||
return utf8str.chars[:utf8str.length]
|
return utf8str.chars[:utf8str.length]
|
||||||
elif type(string_or_id) == bytes:
|
elif isinstance(string_or_id, bytes):
|
||||||
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
|
utf8str = self.intern(<char*>string_or_id, len(string_or_id))
|
||||||
return utf8str.i
|
return utf8str.i
|
||||||
|
elif isinstance(string_or_id, unicode):
|
||||||
|
byte_string = string_or_id.encode('utf8')
|
||||||
|
utf8str = self.intern(<char*>byte_string, len(byte_string))
|
||||||
|
return utf8str.i
|
||||||
else:
|
else:
|
||||||
raise TypeError(type(string_or_id))
|
raise TypeError(type(string_or_id))
|
||||||
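A minimal sketch of the StringStore contract implemented above: unseen strings are interned to consecutive integer IDs starting at 1 (slot 0 is reserved for "missing"), and an ID maps back to the original UTF-8 bytes. The import path is assumed from this diff's module layout.

    from spacy.utf8string import StringStore   # assumed module path

    store = StringStore()
    i = store[b'hello']            # new string: interned, ID returned
    assert i == 1                  # IDs are consecutive; 0 is never handed out
    assert store[i] == b'hello'    # int lookup returns the bytes
    assert store[u'hello'] == i    # unicode input is encoded to UTF-8 first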
|
|
||||||
cdef Utf8Str* intern(self, char* chars, int length) except NULL:
|
cdef const Utf8Str* intern(self, char* chars, int length) except NULL:
|
||||||
# 0 means missing, but we don't bother offsetting the index. We waste
|
# 0 means missing, but we don't bother offsetting the index. We waste
|
||||||
# slot 0 to simplify the code, because it doesn't matter.
|
# slot 0 to simplify the code, because it doesn't matter.
|
||||||
assert length != 0
|
assert length != 0
|
||||||
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
|
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
|
||||||
cdef void* value = self.table.get(key)
|
cdef void* value = self._map.get(key)
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
if value == NULL:
|
if value == NULL:
|
||||||
if self.size == self._resize_at:
|
if self.size == self._resize_at:
|
||||||
|
@ -48,7 +53,7 @@ cdef class StringStore:
|
||||||
self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
|
self.strings[i].chars = <char*>self.mem.alloc(length, sizeof(char))
|
||||||
memcpy(self.strings[i].chars, chars, length)
|
memcpy(self.strings[i].chars, chars, length)
|
||||||
self.strings[i].length = length
|
self.strings[i].length = length
|
||||||
self.table.set(key, <void*>self.size)
|
self._map.set(key, <void*>self.size)
|
||||||
self.size += 1
|
self.size += 1
|
||||||
else:
|
else:
|
||||||
i = <size_t>value
|
i = <size_t>value
|
||||||
|
|
|
@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):

 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,14 @@ def read_prefix(data_dir):
     expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression

+
 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
-    expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
     return expression

+
 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
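Note that read_suffix no longer re.escape()s its entries, so a suffix entry may itself be a regular-expression fragment, while read_prefix still escapes. A small standalone sketch of the resulting expression (the entries below are invented for illustration; the real ones live in the language's 'suffix' data file):

import re

# Hypothetical suffix entries; real data comes from the 'suffix' file on disk.
entries = ["km", r"\)", "'s"]

# Mirrors read_suffix above: pieces are used verbatim and anchored at the end.
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
suffix_re = re.compile(expression)

assert suffix_re.search("10km").group() == "km"
assert suffix_re.search("(easy)").group() == ")"
assert suffix_re.search("Bill's").group() == "'s"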
@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"


 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"


 def test_capitalized():
@@ -38,4 +41,12 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"
+
+
+def test_punct():
+    tokens = EN.tokenize("We've")
+    assert len(tokens) == 2
+    tokens = EN.tokenize("``We've")
+    assert len(tokens) == 3
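The updated assertions reflect the new contraction handling: the surface .string keeps the original pieces while .lemma carries the normalised form. A hedged sketch of one more case in the same style (the expected split for "don't" follows the pattern above and the "n't" lexicon entry used later in this diff, but this exact test is not part of the commit):

from spacy.en import EN


def test_dont_split():
    # Assumed to behave like "ain't" above: pieces preserved, lemmas normalised.
    tokens = EN.tokenize("don't")
    assert [t.string for t in tokens] == ["do", "n't"]
    assert tokens[1].lemma == "not"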
@@ -27,3 +27,9 @@ def test_tweebo_challenge():
     assert tokens[19].string == '")'
     assert tokens[20].string == ':>'
     assert tokens[21].string == '....'
+
+
+def test_false_positive():
+    text = "example:)"
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 3
@@ -19,8 +19,12 @@ def test_save_bytes(sstore):


 def test_save_unicode(sstore):
-    with pytest.raises(TypeError):
-        A_i = sstore['A']
+    Hello_i = sstore[u'Hello']
+    assert Hello_i == 1
+    assert sstore[u'Hello'] == 1
+    assert sstore[u'goodbye'] != Hello_i
+    assert sstore[u'hello'] != Hello_i
+    assert Hello_i == 1


 def test_zero_id(sstore):
tests/test_iter_lexicon.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import pytest
+
+from spacy.en import EN
+
+def test_range_iter():
+    EN.load()
+    for i in range(len(EN.lexicon)):
+        lex = EN.lexicon[i]
+
+
+def test_iter():
+    EN.load()
+    i = 0
+    for lex in EN.lexicon:
+        i += 1
tests/test_lemmatizer.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.util import DATA_DIR
+from os import path
+
+import pytest
+
+
+def test_read_index():
+    wn = path.join(DATA_DIR, 'wordnet')
+    index = read_index(path.join(wn, 'index.noun'))
+    assert 'man' in index
+    assert 'plantes' not in index
+    assert 'plant' in index
+
+
+def test_read_exc():
+    wn = path.join(DATA_DIR, 'wordnet')
+    exc = read_exc(path.join(wn, 'verb.exc'))
+    assert exc['was'] == ('be',)
+
+
+@pytest.fixture
+def lemmatizer():
+    return Lemmatizer(path.join(DATA_DIR, 'wordnet'))
+
+
+def test_noun_lemmas(lemmatizer):
+    do = lemmatizer.noun
+
+    assert do('aardwolves') == set(['aardwolf'])
+    assert do('aardwolf') == set(['aardwolf'])
+    assert do('planets') == set(['planet'])
+    assert do('ring') == set(['ring'])
+    assert do('axes') == set(['axis', 'axe', 'ax'])
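test_noun_lemmas exercises WordNet-style lemmatisation: exception lists plus suffix-substitution rules checked against the index. A self-contained sketch of that general scheme (this follows the standard WordNet "morphy" idea; the rules, index and exception table below are trimmed stand-ins, not spaCy's Lemmatizer or its data):

# Morphy-style noun lemmatisation sketch; the data below is invented for illustration.
NOUN_RULES = [("s", ""), ("ses", "s"), ("ves", "f"), ("xes", "x"),
              ("zes", "z"), ("ches", "ch"), ("shes", "sh")]
INDEX = set(["aardwolf", "planet", "ring", "axis", "axe", "ax"])
EXC = {"axes": ("axe", "ax", "axis")}  # hypothetical exception entries


def noun_lemmas(string):
    forms = set(EXC.get(string, ()))
    if string in INDEX:
        forms.add(string)
    for old, new in NOUN_RULES:
        if string.endswith(old):
            candidate = string[:len(string) - len(old)] + new
            if candidate in INDEX:
                forms.add(candidate)
    return forms or set([string])


assert noun_lemmas("aardwolves") == set(["aardwolf"])
assert noun_lemmas("planets") == set(["planet"])
assert noun_lemmas("axes") == set(["axis", "axe", "ax"])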
@@ -7,6 +7,7 @@ from spacy.lexeme import *


 def test_is_alpha():
+    EN.load()
     the = EN.lexicon['the']
     assert the['flags'] & (1 << IS_ALPHA)
     year = EN.lexicon['1999']
@@ -16,6 +17,7 @@ def test_is_alpha():


 def test_is_digit():
+    EN.load()
     the = EN.lexicon['the']
     assert not the['flags'] & (1 << IS_DIGIT)
     year = EN.lexicon['1999']
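The flag assertions above test single bits in the lexeme's integer 'flags' field. A tiny standalone illustration of that bitfield pattern (the bit positions here are placeholders, not the real IS_ALPHA/IS_DIGIT values from spacy.lexeme):

IS_ALPHA = 1  # placeholder bit position, for illustration only
IS_DIGIT = 2  # placeholder bit position, for illustration only

flags = 1 << IS_ALPHA               # a lexeme flagged as alphabetic
assert flags & (1 << IS_ALPHA)      # non-zero: the ALPHA bit is set
assert not flags & (1 << IS_DIGIT)  # zero: the DIGIT bit is clear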
@@ -1,11 +0,0 @@
-from spacy import util
-
-
-def test_load_en():
-    rules = util.read_tokenization('en')
-    assert len(rules) != 0
-    aint = [rule for rule in rules if rule[0] == "ain't"][0]
-    chunk, pieces = aint
-    assert chunk == "ain't"
-    assert pieces[0] == "are"
-    assert pieces[1] == "not"
@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8


 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10


 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15


 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11


 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'
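The count assertions above leave the expected segmentation implicit. As a concrete illustration, here is one of them spelled out (the exact token strings are an inference from the count and the suffix handling, not something the commit asserts):

from spacy.en import EN

# test_cnts6 expects 6 tokens; presumably the number, the unit and the final
# period are each split off. This expected word list is an assumption.
tokens = EN.tokenize(u'They ran about 10km.')
words = [t.string for t in tokens]
assert words == [u'They', u'ran', u'about', u'10', u'km', u'.']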