mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00
aec130af56
Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download
95 lines
2.6 KiB
Python
95 lines
2.6 KiB
Python
from __future__ import unicode_literals, print_function
|
|
from os import path
|
|
import codecs
|
|
|
|
try:
|
|
import ujson as json
|
|
except ImportError:
|
|
import json
|
|
|
|
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
|
|
from .util import MockPackage as Package
|
|
|
|
|
|
class Lemmatizer(object):
    """Look up candidate lemmas for strings, using WordNet-style index,
    exception, and suffix-rule tables keyed by coarse part-of-speech name
    ('noun', 'verb', 'adj', 'punct')."""

    @classmethod
    def load(cls, pkg_or_str_or_file):
        """Alternate constructor: read the index, exception, and rule data
        from a package object, a path string, or a file-like object.

        Missing data files are tolerated: the corresponding table is left
        empty rather than raising.
        """
        pkg = Package.create_or_return(pkg_or_str_or_file)
        index = {}
        exc = {}
        for pos in ('adj', 'noun', 'verb'):
            with pkg.open(('wordnet', 'index.%s' % pos), default=None) as fp:
                index[pos] = set() if fp is None else read_index(fp)
            with pkg.open(('wordnet', '%s.exc' % pos), default=None) as fp:
                exc[pos] = {} if fp is None else read_exc(fp)
        with pkg.open(('vocab', 'lemma_rules.json'), default=None) as fp:
            rules = {} if fp is None else json.load(fp)
        return cls(index, exc, rules)

    def __init__(self, index, exceptions, rules):
        # index: pos-name -> set of known lemma forms
        self.index = index
        # exc: pos-name -> {surface form -> tuple of lemmas}
        self.exc = exceptions
        # rules: pos-name -> list of (old_suffix, new_suffix) pairs
        self.rules = rules

    def __call__(self, string, pos):
        """Return the set of candidate lemmas for `string` at `pos`.

        `pos` may be one of the symbolic tags (NOUN, VERB, ADJ, PUNCT) or
        a pos-name string; unknown tags fall through unchanged and simply
        find no table entries.
        """
        for tag, name in ((NOUN, 'noun'), (VERB, 'verb'),
                          (ADJ, 'adj'), (PUNCT, 'punct')):
            if pos == tag:
                pos = name
                break
        return lemmatize(string,
                         self.index.get(pos, {}),
                         self.exc.get(pos, {}),
                         self.rules.get(pos, []))

    def noun(self, string):
        """Lemmatize `string` as a noun."""
        return self(string, 'noun')

    def verb(self, string):
        """Lemmatize `string` as a verb."""
        return self(string, 'verb')

    def adj(self, string):
        """Lemmatize `string` as an adjective."""
        return self(string, 'adj')

    def punct(self, string):
        """Lemmatize `string` as punctuation."""
        return self(string, 'punct')
|
|
def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate lemmas for `string`.

    Candidates are gathered from three sources: the string itself if it is
    a known lemma in `index`; any explicit entries in `exceptions`; and
    suffix rewrites from `rules` whose result is either a known lemma or a
    non-alphabetic token (so e.g. numbers survive suffix stripping). If
    nothing matches, the lower-cased input is returned as its own lemma.
    """
    string = string.lower()
    candidates = []
    if string in index:
        candidates.append(string)
    candidates.extend(exceptions.get(string, []))
    for old_suffix, new_suffix in rules:
        if not string.endswith(old_suffix):
            continue
        candidate = string[:len(string) - len(old_suffix)] + new_suffix
        # Accept rewrites that are known lemmas, or non-alphabetic tokens
        # (dates, numbers) that the index can't be expected to contain.
        if candidate in index or not candidate.isalpha():
            candidates.append(candidate)
    return set(candidates) if candidates else set([string])
|
|
def read_index(fileobj):
    """Collect the single-word headwords from a WordNet index file.

    Iterates `fileobj` line by line; header/license lines (which begin
    with a space) are skipped, and multi-word entries (whose headwords
    contain '_' separators) are dropped. Returns a set of words.
    """
    words = set()
    for line in fileobj:
        # WordNet files prefix their license header lines with whitespace.
        if line.startswith(' '):
            continue
        headword = line.split()[0]
        if '_' not in headword:
            words.add(headword)
    return words
|
|
def read_exc(fileobj):
    """Parse a WordNet exception file into a form -> lemmas mapping.

    Each non-header line holds a surface form followed by one or more
    lemmas; header lines (which begin with a space) are skipped. Returns
    a dict mapping each form to a tuple of its lemmas.
    """
    mapping = {}
    for line in fileobj:
        # Skip whitespace-prefixed license/header lines.
        if line.startswith(' '):
            continue
        fields = line.split()
        mapping[fields[0]] = tuple(fields[1:])
    return mapping