This commit is contained in:
Matthew Honnibal 2015-09-14 09:53:33 +02:00
commit 50a7c41429
7 changed files with 220 additions and 2 deletions

View File

@ -6,7 +6,39 @@ from ..language import Language
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
# improved list from Stone, Denis, Kwantes (2010)
# The words are kept in one whitespace-separated string so the list stays easy
# to read and edit; it is converted to a set below for O(1) membership tests.
STOPWORDS = """
a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be
became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can
cannot cant co computer con could couldnt cry de describe
detail did didn do does doesn doing don done down due during
each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen
fifty fill find fire first five for former formerly forty found four from front full further get give go
had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie
if in inc indeed interest into is it its itself keep last latter latterly least less ltd
just
kg km
made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely
neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off
often on once one only onto or other others otherwise our ours ourselves out over own part per
perhaps please put re
quite
rather really regarding
same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten
than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under
until up unless upon us used using
various very via
was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you
your yours yourself yourselves
"""
# str.split() with no arguments never yields empty strings, so no filtering
# is needed before building the set.
STOPWORDS = set(STOPWORDS.split())
class English(Language): class English(Language):
@classmethod @classmethod
def default_data_dir(cls): def default_data_dir(cls):
return LOCAL_DATA_DIR return LOCAL_DATA_DIR
@staticmethod
def is_stop(string):
    """Return 1 if `string`, lower-cased, is in STOPWORDS, otherwise 0."""
    lowered = string.lower()
    if lowered in STOPWORDS:
        return 1
    return 0

View File

@ -94,6 +94,10 @@ class Language(object):
def like_email(string): def like_email(string):
return orth.like_email(string) return orth.like_email(string)
@staticmethod
def is_stop(string):
return 0
@classmethod @classmethod
def default_lex_attrs(cls, data_dir=None): def default_lex_attrs(cls, data_dir=None):
return { return {
@ -116,7 +120,7 @@ class Language(object):
attrs.LIKE_URL: cls.like_url, attrs.LIKE_URL: cls.like_url,
attrs.LIKE_NUM: cls.like_number, attrs.LIKE_NUM: cls.like_number,
attrs.LIKE_EMAIL: cls.like_email, attrs.LIKE_EMAIL: cls.like_email,
attrs.IS_STOP: lambda string: False, attrs.IS_STOP: cls.is_stop,
attrs.IS_OOV: lambda string: True attrs.IS_OOV: lambda string: True
} }

View File

@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
from libc.string cimport memset from libc.string cimport memset
from .orth cimport word_shape from .orth cimport word_shape
@ -35,6 +42,26 @@ cdef class Lexeme:
def py_check_flag(self, attr_id_t flag_id): def py_check_flag(self, attr_id_t flag_id):
return True if Lexeme.check_flag(self.c, flag_id) else False return True if Lexeme.check_flag(self.c, flag_id) else False
def similarity(self, other):
    """Return the cosine similarity between this object's vector and `other`'s.

    `other` may be any object exposing `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector and the plain division would otherwise produce NaN.
    """
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property vector_norm:
    # Reads and writes the `l2_norm` field of the underlying C lexeme struct.
    def __get__(self):
        """Return the stored `l2_norm` of this lexeme."""
        return self.c.l2_norm

    def __set__(self, float norm):
        """Overwrite the stored `l2_norm` of this lexeme with `norm`."""
        self.c.l2_norm = norm
property vector:
    def __get__(self):
        """Return the lexeme's `repvec` buffer as a NumPy array.

        The array has `self.vocab.repvec_length` elements.
        """
        cdef int n_dims = self.vocab.repvec_length
        # Cast the raw C float pointer to a typed view of known length so
        # NumPy can wrap it.
        view = <float[:n_dims,]>self.c.repvec
        return numpy.asarray(view)
property repvec:
    def __get__(self):
        """Alias for `vector`; returns whatever `self.vector` returns."""
        vec = self.vector
        return vec
property orth_: property orth_:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.orth] return self.vocab.strings[self.c.orth]

View File

@ -3,7 +3,9 @@ from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
import numpy import numpy
import numpy.linalg
import struct import struct
cimport numpy as np
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME from ..lexeme cimport EMPTY_LEXEME
@ -118,6 +120,22 @@ cdef class Doc:
def __str__(self): def __str__(self):
return u''.join([t.string for t in self]) return u''.join([t.string for t in self])
def similarity(self, other):
    """Return the cosine similarity between this object's vector and `other`'s.

    `other` may be any object exposing `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector and the plain division would otherwise produce NaN.
    """
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property repvec:
    def __get__(self):
        """Alias for `vector`; returns whatever `self.vector` returns."""
        vec = self.vector
        return vec
property vector:
    def __get__(self):
        """Return the mean vector of the tokens that are not stop words.

        The sum is divided by the number of tokens that actually
        contributed, so stop words no longer shrink the result.

        If every token is a stop word, a zero vector with the shape of a
        token vector is returned, so callers always receive an array.
        If there are no tokens at all, an empty float32 array is returned
        instead of raising ZeroDivisionError.
        """
        vectors = [t.vector for t in self if not t.is_stop]
        if vectors:
            return numpy.sum(vectors, axis=0) / len(vectors)
        if len(self) == 0:
            return numpy.zeros((0,), dtype='float32')
        return numpy.zeros_like(self[0].vector)
property vector_norm:
    def __get__(self):
        """Return `numpy.linalg.norm` of `self.vector`.

        The vector is recomputed on each access; nothing is cached here.
        """
        averaged = self.vector
        return numpy.linalg.norm(averaged)
@property @property
def string(self): def string(self):
return u''.join([t.string for t in self]) return u''.join([t.string for t in self])

View File

@ -1,5 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from collections import defaultdict from collections import defaultdict
import numpy
import numpy.linalg
cimport numpy as np
from ..structs cimport TokenC, LexemeC from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t from ..typedefs cimport flags_t, attr_t
@ -52,6 +55,17 @@ cdef class Span:
def merge(self, unicode tag, unicode lemma, unicode ent_type): def merge(self, unicode tag, unicode lemma, unicode ent_type):
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
def similarity(self, other):
    """Return the cosine similarity between this object's vector and `other`'s.

    `other` may be any object exposing `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector and the plain division would otherwise produce NaN.
    """
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property vector:
    def __get__(self):
        """Return the mean vector of the tokens that are not stop words.

        The sum is divided by the number of tokens that actually
        contributed, so stop words no longer shrink the result.

        If every token is a stop word, a zero vector with the shape of a
        token vector is returned, so callers always receive an array.
        If there are no tokens at all, an empty float32 array is returned
        instead of raising ZeroDivisionError.
        """
        vectors = [t.vector for t in self if not t.is_stop]
        if vectors:
            return numpy.sum(vectors, axis=0) / len(vectors)
        if len(self) == 0:
            return numpy.zeros((0,), dtype='float32')
        return numpy.zeros_like(self[0].vector)
property vector_norm:
    def __get__(self):
        """Return `numpy.linalg.norm` of `self.vector`.

        The vector is recomputed on each access; nothing is cached here.
        """
        averaged = self.vector
        return numpy.linalg.norm(averaged)
property text: property text:
def __get__(self): def __get__(self):
return u' '.join([t.text for t in self]) return u' '.join([t.text for t in self])

View File

@ -49,6 +49,9 @@ cdef class Token:
def nbor(self, int i=1): def nbor(self, int i=1):
return self.doc[self.i+i] return self.doc[self.i+i]
def similarity(self, other):
    """Return the cosine similarity between this object's vector and `other`'s.

    `other` may be any object exposing `vector` and `vector_norm`.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector and the plain division would otherwise produce NaN.
    """
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property lex_id: property lex_id:
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
@ -125,12 +128,20 @@ cdef class Token:
def __get__(self): def __get__(self):
return self.c.dep return self.c.dep
property repvec: property vector:
def __get__(self): def __get__(self):
cdef int length = self.vocab.repvec_length cdef int length = self.vocab.repvec_length
repvec_view = <float[:length,]>self.c.lex.repvec repvec_view = <float[:length,]>self.c.lex.repvec
return numpy.asarray(repvec_view) return numpy.asarray(repvec_view)
property repvec:
    def __get__(self):
        """Alias for `vector`; returns whatever `self.vector` returns."""
        vec = self.vector
        return vec
property vector_norm:
    def __get__(self):
        """Return the `l2_norm` stored on this token's lexeme."""
        norm = self.c.lex.l2_norm
        return norm
property n_lefts: property n_lefts:
def __get__(self): def __get__(self):
cdef int n = 0 cdef int n = 0
@ -302,6 +313,9 @@ cdef class Token:
property is_oov: property is_oov:
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
property is_stop:
    def __get__(self):
        """Return whether the IS_STOP flag is set on this token's lexeme."""
        flag_set = Lexeme.check_flag(self.c.lex, IS_STOP)
        return flag_set
property is_alpha: property is_alpha:
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)

View File

@ -0,0 +1,109 @@
import pytest
@pytest.mark.models
def test_token_vector(EN):
    """Smoke test: `vector` and `vector_norm` can be read on a Token."""
    doc = EN(u'Apples and oranges')
    first = doc[0]
    # Attribute access only; the test passes if neither raises.
    first.vector
    first.vector_norm
@pytest.mark.models
def test_lexeme_vector(EN):
    """Smoke test: `vector` and `vector_norm` can be read on a Lexeme."""
    entry = EN.vocab[u'apples']
    # Attribute access only; the test passes if neither raises.
    entry.vector
    entry.vector_norm
@pytest.mark.models
def test_doc_vector(EN):
    """Smoke test: `vector` and `vector_norm` can be read on a Doc."""
    parsed = EN(u'Apples and oranges')
    # Attribute access only; the test passes if neither raises.
    parsed.vector
    parsed.vector_norm
@pytest.mark.models
def test_span_vector(EN):
    """Smoke test: `vector` and `vector_norm` can be read on a Span."""
    doc = EN(u'Apples and oranges')
    first_two = doc[0:2]
    # Attribute access only; the test passes if neither raises.
    first_two.vector
    first_two.vector_norm
@pytest.mark.models
def test_token_token_similarity(EN):
    """Token/Token similarity is symmetric and strictly between 0 and 1."""
    apples, oranges = EN(u'apples oranges')
    forward = apples.similarity(oranges)
    backward = oranges.similarity(apples)
    assert forward == backward
    assert 0.0 < forward < 1.0
@pytest.mark.models
def test_token_lexeme_similarity(EN):
    """Token/Lexeme similarity is symmetric and strictly between 0 and 1."""
    # EN(...) returns a Doc; index into it so the test really uses a Token.
    apples = EN(u'apples')[0]
    oranges = EN.vocab[u'oranges']
    assert apples.similarity(oranges) == oranges.similarity(apples)
    assert 0.0 < apples.similarity(oranges) < 1.0
@pytest.mark.models
def test_token_span_similarity(EN):
    """Token/Span similarity is symmetric and strictly between 0 and 1."""
    doc = EN(u'apples orange juice')
    token, span = doc[0], doc[1:3]
    forward = token.similarity(span)
    backward = span.similarity(token)
    assert forward == backward
    assert 0.0 < forward < 1.0
@pytest.mark.models
def test_token_doc_similarity(EN):
    """Token/Doc similarity is symmetric and strictly between 0 and 1."""
    doc = EN(u'apples orange juice')
    token = doc[0]
    forward = token.similarity(doc)
    backward = doc.similarity(token)
    assert forward == backward
    assert 0.0 < forward < 1.0
@pytest.mark.models
def test_lexeme_span_similarity(EN):
    """Lexeme/Span similarity is symmetric and strictly between 0 and 1."""
    doc = EN(u'apples orange juice')
    lexeme = EN.vocab[u'apples']
    tail = doc[1:3]
    forward = lexeme.similarity(tail)
    backward = tail.similarity(lexeme)
    assert forward == backward
    assert 0.0 < forward < 1.0
@pytest.mark.models
def test_lexeme_lexeme_similarity(EN):
    """Lexeme/Lexeme similarity is symmetric and strictly between 0 and 1."""
    vocab = EN.vocab
    first, second = vocab[u'apples'], vocab[u'oranges']
    forward = first.similarity(second)
    backward = second.similarity(first)
    assert forward == backward
    assert 0.0 < forward < 1.0
@pytest.mark.models
def test_lexeme_doc_similarity(EN):
    """Lexeme/Doc similarity is symmetric and strictly between 0 and 1."""
    doc = EN(u'apples orange juice')
    lexeme = EN.vocab[u'apples']
    forward = lexeme.similarity(doc)
    backward = doc.similarity(lexeme)
    assert forward == backward
    assert 0.0 < forward < 1.0
@pytest.mark.models
def test_span_span_similarity(EN):
    """Span/Span similarity is symmetric and strictly between 0 and 1."""
    doc = EN(u'apples orange juice')
    head, tail = doc[0:2], doc[1:3]
    forward = head.similarity(tail)
    backward = tail.similarity(head)
    assert forward == backward
    assert 0.0 < forward < 1.0
@pytest.mark.models
def test_span_doc_similarity(EN):
    """Span/Doc similarity is symmetric and strictly between 0 and 1."""
    doc = EN(u'apples orange juice')
    apples = doc[0:2]
    assert apples.similarity(doc) == doc.similarity(apples)
    assert 0.0 < apples.similarity(doc) < 1.0
@pytest.mark.models
def test_doc_doc_similarity(EN):
    """Doc/Doc similarity is symmetric and strictly between 0 and 1."""
    apples = EN(u'apples and apple pie')
    oranges = EN(u'orange juice')
    # Compare both call directions; comparing a call with itself proves nothing.
    assert apples.similarity(oranges) == oranges.similarity(apples)
    assert 0.0 < apples.similarity(oranges) < 1.0