Mirror of https://github.com/explosion/spaCy.git
Add POS utilities

parent 5ebe14f353
commit 224bdae996

 spacy/pos_util.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
@@ -0,0 +1,55 @@
from __future__ import unicode_literals
from . import util
from . import tokens
from .en import EN

from .pos import Tagger

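# Re-merge a pre-tokenized, tagged line: util.detokenize returns groups of
# word indices to fuse back together, and each group's words are joined
# with '<SEP>' while its tags are joined with '_'.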
def realign_tagged(token_rules, tagged_line, sep='/'):
    words, pos = zip(*[token.rsplit(sep, 1) for token in tagged_line.split()])
    positions = util.detokenize(token_rules, words)
    aligned = []
    for group in positions:
        w_group = [words[i] for i in group]
        p_group = [pos[i] for i in group]
        aligned.append('<SEP>'.join(w_group) + sep + '_'.join(p_group))
    return ' '.join(aligned)

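# Read a tagged file (one 'word/TAG word/TAG ...' sentence per line) into
# (tokens, tags) pairs, realigning each line first so the counts match.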
def read_tagged(detoken_rules, file_, sep='/'):
    sentences = []
    for line in file_:
        line = realign_tagged(detoken_rules, line, sep=sep)
        tokens, tags = _parse_line(line, sep)
        assert len(tokens) == len(tags)
        sentences.append((tokens, tags))
    return sentences

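# Split a realigned line back into words and per-subtoken tags; when
# EN.tokenize yields more subtokens than gold tags, pad with 'NULL' so
# the lengths agree.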
def _parse_line(line, sep):
    words = []
    tags = []
    for token_str in line.split():
        word, pos = token_str.rsplit(sep, 1)
        word = word.replace('<SEP>', '')
        subtokens = EN.tokenize(word)
        subtags = pos.split('_')
        while len(subtags) < len(subtokens):
            subtags.append('NULL')
        assert len(subtags) == len(subtokens), [t.string for t in subtokens]
        words.append(word)
        tags.extend([Tagger.encode_pos(pos) for pos in subtags])
    return EN.tokenize(' '.join(words)), tags

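# Count how often each word occurs with each tag across the training
# sentences, skipping the padded 'NULL' tags.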
def get_tagdict(train_sents):
    tagdict = {}
    for tokens, tags in train_sents:
        for i, tag in enumerate(tags):
            if tag == 'NULL':
                continue
            word = tokens.string(i)
            tagdict.setdefault(word, {}).setdefault(tag, 0)
            tagdict[word][tag] += 1
    return tagdict
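A self-contained sketch of what realign_tagged does. The detokenize stand-in
below is an assumption inferred from how util.detokenize is consumed above
(it must yield groups of word indices to re-join); the rule "do n't" and the
sample line are made up for illustration.

    def detokenize(token_rules, words):
        # Stand-in for util.detokenize: group consecutive indices whose
        # space-joined form matches a rule; everything else stays alone.
        rules = {tuple(rule.split()) for rule in token_rules}
        groups, i = [], 0
        while i < len(words):
            for n in (3, 2):  # try longer merges first
                if tuple(words[i:i + n]) in rules:
                    groups.append(list(range(i, i + n)))
                    i += n
                    break
            else:
                groups.append([i])
                i += 1
        return groups

    def realign_tagged(token_rules, tagged_line, sep='/'):
        # Same logic as the function above, minus the spaCy imports.
        words, pos = zip(*[tok.rsplit(sep, 1) for tok in tagged_line.split()])
        aligned = []
        for group in detokenize(token_rules, words):
            w_group = [words[i] for i in group]
            p_group = [pos[i] for i in group]
            aligned.append('<SEP>'.join(w_group) + sep + '_'.join(p_group))
        return ' '.join(aligned)

    print(realign_tagged(["do n't"], "I/PRP do/VBP n't/RB know/VB"))
    # -> I/PRP do<SEP>n't/VBP_RB know/VB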
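Similarly, a sketch of the tag dictionary get_tagdict accumulates, using
plain word lists in place of spaCy's Tokens object (tokens.string(i) becomes
direct indexing); the training sentences are made up for illustration.

    train_sents = [
        (['I', 'saw', 'her', 'duck'], ['PRP', 'VBD', 'PRP$', 'NN']),
        (['ducks', 'duck'], ['NNS', 'VBP']),
    ]

    tagdict = {}
    for words, tags in train_sents:
        for word, tag in zip(words, tags):
            if tag == 'NULL':
                continue
            tagdict.setdefault(word, {}).setdefault(tag, 0)
            tagdict[word][tag] += 1

    print(tagdict['duck'])  # -> {'NN': 1, 'VBP': 1}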