Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc.
This commit is contained in:
parent 5c1705d5be
commit 556f6a18ca
28  spacy/__init__.py  Normal file
@@ -0,0 +1,28 @@
from .lexeme import lex_of
from .lexeme import sic_of


__all__ = [lex_of, sic_of]


"""
from .tokens import ids_from_string
from .tokens import group_by

from .lex import sic_of
from .lex import lex_of
from .lex import normed_of
from .lex import first_of
from .lex import last_three_of

from .lex import cluster_of
from .lex import prob_of

from .lex import is_oft_upper
from .lex import is_oft_title

from .lex import can_noun
from .lex import can_verb
from .lex import can_adj
from .lex import can_adv
"""
4529  spacy/en.cpp  Normal file
File diff suppressed because it is too large
17  spacy/en.pxd  Normal file
@@ -0,0 +1,17 @@
from ext.sparsehash cimport dense_hash_map
from spacy.lexeme cimport StringHash
from spacy.lexeme cimport Lexeme


ctypedef Py_UNICODE* string_ptr
ctypedef size_t Lexeme_addr  # For python interop
ctypedef Lexeme* Lexeme_ptr


cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES


cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0
cdef StringHash hash_string(unicode s, size_t length) except 0
cpdef unicode unhash(StringHash hash_value)
165  spacy/en.pyx  Normal file
@@ -0,0 +1,165 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t

from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B


STRINGS = {}
LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
LEXEMES.set_empty_key(0)


cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


cpdef Lexeme_addr lookup(unicode string) except 0:
    '''.. function:: enumerate(sequence[, start=0])
    Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, splitting off any attached punctuation or clitics. A
    reference to BLANK_WORD is returned for the empty string.

    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* word_ptr = LEXEMES[hashed]
    cdef size_t n
    if word_ptr == NULL:
        word_ptr = _add(hashed, string, _find_split(string, length), length)
    return <Lexeme_addr>word_ptr


cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
    '''Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, given the specified start and end indices. A negative index
    significes 0 for start, and the string length for end --- i.e. the string
    will not be sliced if start == -1 and end == -1.

    A reference to BLANK_WORD is returned for the empty string.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
    if chunk_ptr == NULL:
        chunk_ptr = _add(hashed, string, start, length)
    return <Lexeme_addr>chunk_ptr


cdef StringHash hash_string(unicode s, size_t length) except 0:
    '''Hash unicode with MurmurHash64A'''
    assert length
    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)


cpdef unicode unhash(StringHash hash_value):
    '''Fetch a string from the reverse index, given its hash value.'''
    cdef string_ptr string = STRINGS[hash_value]
    if string == NULL:
        raise ValueError(hash_value)

    return string


cdef unicode normalize_word_string(unicode word):
    '''Return a normalized version of the word, mapping:
    - 4 digit strings into !YEAR
    - Other digit strings into !DIGITS
    - All other strings into lower-case
    '''
    cdef unicode s
    if word.isdigit() and len(word) == 4:
        return '!YEAR'
    elif word[0].isdigit():
        return '!DIGITS'
    else:
        return word.lower()


cpdef unicode _substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
    assert string
    assert split <= length
    word = _init_lexeme(string, hashed, split, length)
    LEXEMES[hashed] = word
    STRINGS[hashed] = string
    return word


cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
                          int split, size_t length) except NULL:
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = _substr(string, 0, split, length)
        tail_string = _substr(string, split, length, length)
    else:
        lex = string
        tail_string = ''
    assert lex
    cdef unicode normed = normalize_word_string(lex)
    cdef unicode last3 = _substr(string, length - 3, length, length)

    assert normed
    assert len(normed)

    word.lex = hash_string(lex, len(lex))
    word.normed = hash_string(normed, len(normed))
    word.last3 = hash_string(last3, len(last3))

    STRINGS[word.lex] = lex
    STRINGS[word.normed] = normed
    STRINGS[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lookup(tail_string)
    return word


cdef size_t _find_split(unicode word, size_t length):
    cdef size_t i = 0
    if word[0].isalnum():
        while i < length and word[i].isalnum():
            i += 1
    else:
        # Split off a punctuation character, or a sequence of the same punctuation character
        while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
            i += 1
    return i
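The module above is the hash-table core: lookup() interns a chunk, _find_split() decides where punctuation splits off, and unhash() recovers strings from the reverse index. A minimal usage sketch follows, assuming the Cython extension modules in this commit are compiled; it only exercises behaviour that the tests further down also rely on.

from __future__ import unicode_literals

from spacy.en import lookup, unhash
from spacy import lex_of

# lookup() returns a Lexeme address; _find_split() puts trailing punctuation
# on the tail Lexeme, so lex_of() sees only the word part.
addr = lookup('Hello!')
assert unhash(lex_of(addr)) == 'Hello'

# The empty string maps to the static BLANK_WORD rather than a new Lexeme.
assert lookup('') == lookup('')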
2433  spacy/lexeme.cpp  Normal file
File diff suppressed because it is too large
35  spacy/lexeme.pxd  Normal file
@@ -0,0 +1,35 @@
from libc.stdint cimport uint64_t


ctypedef int ClusterID
ctypedef uint64_t StringHash


cdef struct Lexeme:
    StringHash sic  # Hash of the original string
    StringHash lex  # Hash of the word, with punctuation and clitics split off
    StringHash normed  # Hash of the normalized version of lex
    StringHash last3  # Last 3 characters of the token
    Py_UNICODE first  # First character of the token

    double prob  # What is the log probability of the lex value?
    ClusterID cluster  # Brown cluster of the token

    bint oft_upper  # Is the lowered version of the lex value often in all caps?
    bint oft_title  # Is the lowered version of the lex value often title-cased?
    Lexeme* tail  # Lexemes are linked lists, to deal with sub-tokens


# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via:
# for field in range(LexAttr.n): get_attr(Lexeme*, field)
cdef enum HashFields:
    sic
    lex
    normed
    cluster
    n


#cdef uint64_t get_attr(Lexeme* word, HashFields attr)
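Since the Lexeme struct above is plain C, it can be easier to read its layout as ordinary Python. The following dataclass is an illustration only, not part of this commit: it mirrors the fields, with the hash-valued slots as integers and tail as an optional link to the next sub-token.

from dataclasses import dataclass
from typing import Optional


@dataclass
class LexemeSketch:
    # Illustration of the Lexeme struct layout; not part of the package.
    sic: int = 0              # hash of the original whitespace-delimited chunk
    lex: int = 0              # hash of the word with punctuation/clitics split off
    normed: int = 0           # hash of the normalized form of lex
    last3: int = 0            # hash of the last three characters
    first: str = ''           # first character of the token
    prob: float = 0.0         # unigram log probability (loaded later)
    cluster: int = 0          # Brown cluster ID (loaded later)
    oft_upper: bool = False   # often written in all caps?
    oft_title: bool = False   # often written title-cased?
    tail: Optional['LexemeSketch'] = None  # linked-list pointer to sub-tokens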
114  spacy/lexeme.pyx  Normal file
@@ -0,0 +1,114 @@
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''


cpdef StringHash sic_of(size_t lex_id) except 0:
    '''Access the `sic' field of the Lexeme pointed to by lex_id.

    The sic field stores the hash of the whitespace-delimited string-chunk used to
    construct the Lexeme.

    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi!', u'', u'world]
    '''
    return (<Lexeme*>lex_id).sic


cpdef StringHash lex_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off. The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).lex


cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word,
    which should be understood as a binary address:

    >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
    >>> token_ids = [lookup(s) for s in strings]
    >>> clusters = [cluster_of(t) for t in token_ids]
    >>> print ["{0:b"} % cluster_of(t) for t in token_ids]
    ["100111110110", "100111100100", "01010111011001", "100111110110"]

    The clusterings are unideal, but often slightly useful.
    "pineapple" and "apple" share a long prefix, indicating a similar meaning,
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    return (<Lexeme*>lex_id).cluster


cpdef Py_UNICODE first_of(size_t lex_id):
    '''Access the `first' field of the Lexeme pointed to by lex_id, which
    stores the first character of the lex string of the word.

    >>> lex_id = lookup(u'Hello')
    >>> unhash(first_of(lex_id))
    u'H'
    '''
    return (<Lexeme*>lex_id).first


cpdef double prob_of(size_t lex_id):
    '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
    the smoothed unigram log probability of the word, as estimated from a large
    text corpus. By default, probabilities are based on counts from Gigaword,
    smoothed using Knesser-Ney; but any probabilities file can be supplied to
    load_probs.

    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    pass


cpdef StringHash last3_of(size_t lex_id):
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3


cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    in all-upper case frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.95, picked to maximize mutual information for POS tagging.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper


cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    title-cased frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
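These accessors all take a raw Lexeme address (a size_t) as produced by en.lookup(), so they are usable directly from Python. A short sketch, assuming the extensions are built; note that prob_of() is still a stub (its body is pass), so only the hash-valued and character-valued accessors are shown.

from __future__ import unicode_literals

from spacy.en import lookup, unhash
from spacy.lexeme import sic_of, lex_of, last3_of, first_of

lex_id = lookup('Hello')                    # an address, exposed as an integer
assert unhash(sic_of(lex_id)) == 'Hello'    # sic: the whole chunk as given
assert unhash(lex_of(lex_id)) == 'Hello'    # lex: the chunk minus split-off punctuation
assert unhash(last3_of(lex_id)) == 'llo'    # last3: hash of the final three characters
assert first_of(lex_id) == 'H'              # first: a single character, not a hash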
2064  spacy/spacy.cpp  Normal file
File diff suppressed because it is too large
5  spacy/spacy.pxd  Normal file
@@ -0,0 +1,5 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme


cpdef vector[size_t] expand_chunk(size_t addr) except *
72  spacy/spacy.pyx  Normal file
@@ -0,0 +1,72 @@
from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme


cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens


"""
cpdef vector[size_t] ids_from_text(unicode text) except *:
    cdef size_t length = len(text)
    cdef Py_UNICODE* characters = <Py_UNICODE*>text

    cdef size_t i
    cdef Py_UNICODE c

    cdef vector[size_t] tokens = vector[size_t]()
    cdef unicode current = u''
    cdef Lexeme* token
    cdef int alnum_end = -1
    cdef size_t alnum_start = 0
    cdef bint seen_alnum = False
    for i in range(length):
        c = characters[i]
        if is_whitespace(c):
            token = <Lexeme*>lookup(current)
            tokens.push_back(<size_t>token)
            clitic = 0
            while token.clitics[clitic]:
                tokens.push_back(token.clitics[clitic])
                clitic += 1
            current = u''
            alnum_start = 0
            alnum_end = -1
            seen_alnum = False
        else:
            if not seen_alnum and c.isalnum():
                alnum_start = i
                seen_alnum = True
            elif seen_alnum and alnum_end == -1 and not c.isalnum():
                alnum_end = i
            current += c
    if current:
        token = <Lexeme*>lookup(current)
        tokens.push_back(<size_t>token)
        clitic = 0
        while token.clitics[clitic]:
            tokens.push_back(token.clitics[clitic])
            clitic += 1
    return tokens
"""

#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
#    pass


cdef inline bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
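expand_chunk() above walks the Lexeme.tail chain and returns every sub-token address for a stored chunk; this is how the punctuation tests below inspect splits. A small sketch, assuming the extensions are built and using the same strings the tests use:

from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup, unhash

# '(Hello)' is stored as one chunk whose tail links hold the sub-tokens.
tokens = expand_chunk(lookup('(Hello)'))
assert [unhash(lex_of(t)) for t in tokens] == ['(', 'Hello', ')']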
75  spacy/util.py  Normal file
@@ -0,0 +1,75 @@
def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')


def load_case_stats(data_dir):
    case_loc = path.join(data_dir, 'english.case')
    case_stats = {}
    with utf8open(case_loc) as cases_file:
        for line in cases_file:
            word, upper, title = line.split()
            case_stats[word] = (float(upper), float(title))
    return case_stats


def load_clitics(data_dir):
    clitics_loc = path.join(data_dir, 'clitics.txt')
    entries = []
    seen = set()
    with utf8open(clitics_loc) as clitics_file:
        for line in clitics_file:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
            clitics = line.split()
            word = clitics.pop(0)
            norm_form = clitics.pop(0)
            assert word not in seen, word
            seen.add(word)
            entries.append((word, norm_form, clitics))
    return entries


"""
def load_browns(self, data_dir):
    cdef Lexeme* w
    case_stats = load_case_stats(data_dir)
    brown_loc = path.join(data_dir, 'bllip-clusters')
    assert path.exists(brown_loc)
    cdef size_t start
    cdef int end
    with utf8open(brown_loc) as browns_file:
        for i, line in enumerate(browns_file):
            cluster_str, word, freq_str = line.split()
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See redshift._parse_features.pyx
            cluster = int(cluster_str[::-1], 2)
            upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
            start = 0
            end = -1
            find_slice(&start, &end, word)
            print "Load", repr(word), start, end
            w = <Lexeme*>init_word(word, start, end, cluster,
                                   upper_pc, title_pc, int(freq_str))
            self.words[_hash_str(word)] = <size_t>w
            self.strings[<size_t>w] = word

def load_clitics(self, data_dir):
    cdef unicode orig_str
    cdef unicode clitic
    for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
        w = init_clitic(orig_str, <Lexeme*>self.lookup_slice(norm_form, 0, -1))
        self.words[w.orig] = <size_t>w
        self.strings[<size_t>w] = orig_str
        assert len(clitic_strs) < MAX_CLITICS
        assert clitic_strs
        for i, clitic in enumerate(clitic_strs):
            # If we write punctuation here, assume we want to keep it,
            # so tell it the slice boundaries (the full string)
            w.clitics[i] = self.lookup_slice(clitic, 0, -1)
            # Ensure we null terminate
            w.clitics[i+1] = 0
"""
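load_clitics() above expects <data_dir>/clitics.txt to hold whitespace-separated lines of the form "<word> <norm_form> <clitic> [<clitic> ...]", skipping blank lines and '#' comments. (Note that utf8open() and the path.join() calls rely on codecs and os.path, which this module does not import yet.) Below is a self-contained sketch of the same parsing logic on hypothetical sample entries, so the expected file shape is visible without the data directory:

SAMPLE_CLITICS = """\
# word    norm_form   clitics...
don't     do          do n't
won't     will        wo n't
"""

def parse_clitics(text):
    # Same logic as load_clitics(), but reading from a string instead of a file.
    entries = []
    seen = set()
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        pieces = line.split()
        word = pieces.pop(0)
        norm_form = pieces.pop(0)
        assert word not in seen, word
        seen.add(word)
        entries.append((word, norm_form, pieces))
    return entries

print(parse_clitics(SAMPLE_CLITICS))
# [("don't", 'do', ['do', "n't"]), ("won't", 'will', ['wo', "n't"])]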
BIN  tests/.test_tokenizer.py.swo  Normal file
Binary file not shown.
0  tests/my_test.py  Normal file
48  tests/test_post_punct.py  Normal file
@@ -0,0 +1,48 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def close_puncts():
    return [')', ']', '}', '*']


def test_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p


def test_two_different_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + "'"
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p
        assert unhash(lex_of(tokens[2])) == "'"


def test_three_same_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + p + p
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p + p + p
50  tests/test_pre_punct.py  Normal file
@@ -0,0 +1,50 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def open_puncts():
    return ['(', '[', '{', '*']


def test_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == word_str


def test_two_different_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + "`" + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == "`"
        assert unhash(lex_of(tokens[2])) == word_str


def test_three_same_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + p + p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p + p + p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p + p + p
        assert unhash(lex_of(tokens[1])) == word_str
39  tests/test_surround_punct.py  Normal file
@@ -0,0 +1,39 @@
from __future__ import unicode_literals

from spacy import lex_of, sic_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def paired_puncts():
    return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


def test_token(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = open_ + word_str + close_
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == open_
        assert unhash(lex_of(tokens[1])) == word_str
        assert unhash(lex_of(tokens[2])) == close_
        assert unhash(sic_of(tokens[0])) == string


def test_two_different(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 5
        assert unhash(lex_of(tokens[0])) == "`"
        assert unhash(lex_of(tokens[1])) == open_
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[3])) == close_
        assert unhash(lex_of(tokens[4])) == "'"
30  tests/test_vocab.py  Normal file
@@ -0,0 +1,30 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.en import lookup
from spacy.en import unhash


def test_neq():
    addr = lookup('Hello')
    assert lookup('bye') != addr


def test_eq():
    addr = lookup('Hello')
    assert lookup('Hello') == addr


def test_round_trip():
    hello = lookup('Hello')
    assert unhash(lex_of(hello)) == 'Hello'


def test_case_neq():
    addr = lookup('Hello')
    assert lookup('hello') != addr


def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr