mirror of https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00

commit 4e79446dc2
parent 9bef797afe

    Reading in tokenization rules correctly. Passing tests.
 1395  spacy/en.cpp   (file diff suppressed because it is too large)
   27  spacy/en.pyx

spacy/en.pyx

@@ -10,6 +10,7 @@ from libc.stdint cimport uint64_t
 from spacy.lexeme cimport Lexeme
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
+from . import util


 STRINGS = {}
@@ -20,6 +21,23 @@ LEXEMES.set_empty_key(0)
 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


+def load_tokenization(token_rules):
+    cdef Lexeme* word
+    cdef StringHash hashed
+    for chunk, lex, tokens in token_rules:
+        hashed = hash_string(chunk, len(chunk))
+        assert LEXEMES[hashed] == NULL
+        word = _add(hashed, lex, len(lex), len(lex))
+        for i, lex in enumerate(tokens):
+            token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
+            length = len(token_string)
+            hashed = hash_string(token_string, length)
+            word.tail = _add(hashed, lex, 0, len(lex))
+            word = word.tail
+
+
+load_tokenization(util.read_tokenization('en'))
+
 cpdef Lexeme_addr lookup(unicode string) except 0:
     '''.. function:: enumerate(sequence[, start=0])
     Fetch a Lexeme representing a word string. If the word has not been seen,
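
For reference, a hedged sketch of the key scheme the new load_tokenization hook uses, assuming a rule entry of the form ("don't", "do", ["n't"]) (a hypothetical example; the data file itself is not shown in this diff). The chunk string is hashed as the key for the head lexeme, and every following token is keyed by a synthetic string built from the chunk, the token's position, and the token itself, so the tail entries cannot collide with ordinary words:

    # Plain-Python sketch of the key scheme above; the example rule is hypothetical.
    chunk, lex, tokens = "don't", "do", ["n't"]
    head_key = chunk                   # hashed with hash_string(chunk, len(chunk))
    tail_keys = ['%s:@:%d:@:%s' % (chunk, i, t) for i, t in enumerate(tokens)]
    print(head_key)    # don't               -> head lexeme stores lex "do"
    print(tail_keys)   # ["don't:@:0:@:n't"] -> chained off the head via word.tail
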
@@ -156,8 +174,8 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
 cdef size_t _find_split(unicode word, size_t length):
     cdef int i = 0
     # Contractions
-    if word == "'s":
-        return 2
+    if word.endswith("'s"):
+        return length - 2
     # Leading punctuation
     if is_punct(word, 0, length):
         return 1
@@ -166,11 +184,8 @@ cdef size_t _find_split(unicode word, size_t length):
     i = length - 1
     while i >= 2 and is_punct(word, i-1, length):
         i -= 1
-    else:
-        # Doesn't start or end with the punct
-        while i < length and not is_punct(word, i, length):
-            i += 1
     return i

+
 cdef bint is_punct(unicode word, size_t i, size_t length):
     return not word[i].isalnum()
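
The two hunks above make _find_split strip a trailing "'s" from any token rather than only handling the bare "'s" string, and drop the old while/else fallback that scanned forward for the first punctuation character. A plain-Python approximation of the revised heuristic (the Cython types and the module-level is_punct are simplified; this is an illustration, not the compiled code):

    def _is_punct(word, i):
        # same test as the Cython is_punct helper: non-alphanumeric counts as punctuation
        return not word[i].isalnum()

    def _find_split(word):
        length = len(word)
        if word.endswith("'s"):            # contractions: "it's" -> "it" + "'s"
            return length - 2
        if _is_punct(word, 0):             # leading punctuation: peel one character
            return 1
        i = length - 1                     # otherwise walk back over trailing punctuation
        while i >= 2 and _is_punct(word, i - 1):
            i -= 1
        return i

    print(_find_split("it's"))     # 2 -> "it" + "'s"
    print(_find_split("Hello,"))   # 5 -> "Hello" + ","
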

(generated C++ source; filename not captured on this page)

@@ -1,4 +1,4 @@
-/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */
+/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */

 #define PY_SSIZE_T_CLEAN
 #ifndef CYTHON_USE_PYLONG_INTERNALS

spacy/util.py

@@ -1,3 +1,10 @@
+import os
+from os import path
+import codecs
+
+DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
+
+
 def utf8open(loc, mode='r'):
     return codecs.open(loc, mode, 'utf8')

@@ -12,23 +19,23 @@ def load_case_stats(data_dir):
     return case_stats


-def load_clitics(data_dir):
-    clitics_loc = path.join(data_dir, 'clitics.txt')
+def read_tokenization(lang):
+    loc = path.join(DATA_DIR, lang, 'tokenization')
     entries = []
     seen = set()
-    with utf8open(clitics_loc) as clitics_file:
-        for line in clitics_file:
+    with utf8open(loc) as file_:
+        for line in file_:
             line = line.strip()
             if line.startswith('#'):
                 continue
             if not line:
                 continue
-            clitics = line.split()
-            word = clitics.pop(0)
-            norm_form = clitics.pop(0)
-            assert word not in seen, word
-            seen.add(word)
-            entries.append((word, norm_form, clitics))
+            pieces = line.split()
+            chunk = pieces.pop(0)
+            lex = pieces.pop(0)
+            assert chunk not in seen, chunk
+            seen.add(chunk)
+            entries.append((chunk, lex, pieces))
     return entries

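
read_tokenization replaces the old clitics loader and reads data/<lang>/tokenization instead. The file's contents are not part of this commit page, but the parser implies whitespace-separated columns: the chunk as written, its lex form, then the remaining tokens, with '#'-prefixed comments and blank lines skipped. A hypothetical two-rule file and the entries it would yield:

    # hypothetical contents of data/en/tokenization (format inferred from the parser; not from this commit)
    # chunk    lex    trailing tokens
    don't      do     n't
    it's       it     's

    >>> read_tokenization('en')
    [("don't", 'do', ["n't"]), ("it's", 'it', ["'s"])]
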

(test file; filename not captured on this page)

@@ -28,3 +28,10 @@ def test_case_neq():
 def test_punct_neq():
     addr = lookup('Hello')
     assert lookup('Hello,') != addr
+
+
+def test_short():
+    addr = lookup('I')
+    assert unhash(lex_of(addr)) == 'I'
+    addr = lookup('not')
+    assert unhash(lex_of(addr)) == 'not'
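
Assuming the hypothetical "don't do n't" rule from the sketch above actually appears in data/en/tokenization, the new pieces are meant to fit together roughly like this (the exact behaviour of lookup, lex_of and unhash on special-cased chunks is inferred, not shown in this diff):

    # assumed import path; the test file's own imports are not shown on this page
    from spacy.en import lookup, unhash, lex_of

    addr = lookup("don't")                 # head lexeme pre-loaded by load_tokenization
    assert unhash(lex_of(addr)) == 'do'    # its lex is the first token of the rule
    # the trailing "n't" lexeme is reachable through the head's .tail pointer
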