1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
| import struct
class SouGo(object):
    """Reader for a binary input-method lexicon file.

    The file is read completely into memory and parsed on construction:

    * four header text fields (name, type, description, examples) stored at
      fixed byte offsets,
    * a pinyin table occupying ``[start_pinyin, start_word)``,
    * a word table running from ``start_word`` to the end of the file.

    All text is stored as 16-bit code units. They are read as little-endian
    UTF-16 -- NOTE(review): the byte order is assumed from the behaviour of
    the original native-order read on little-endian hosts; confirm against
    the format specification.

    Usage::

        sg = SouGo("path/to/lexicon_file")
        sg.save(path=new_path)
    """

    # Byte offsets at which the pinyin table and the word table start.
    start_pinyin = 0x1540
    start_word = 0x2628
    # Byte ranges of the fixed-size header text fields.
    lexicon_name = slice(0x130, 0x338)
    lexicon_type = slice(0x338, 0x540)
    lexicon_describe = slice(0x540, 0xd40)
    lexicon_demo = slice(0xd40, start_pinyin)

    def __init__(self, file_path):
        """Load ``file_path`` and parse it immediately.

        Note that parsing also prints the header fields (see ``info``).
        """
        with open(file_path, 'rb') as f:
            self.data = f.read()
        self.pinyin = {}       # pinyin index -> syllable text
        self.dictionary = []   # list of (count, pinyin, word) tuples
        self.init()

    def init(self):
        """Print the header, then build the pinyin table and the word list."""
        self.info()
        self.build_pinyin(self.data[self.start_pinyin:self.start_word])
        self.build_tokens(self.data[self.start_word:])

    def info(self):
        """Print the four header text fields to stdout."""

        def show(slice_):
            return self.bytes2str(self.data[slice_])

        print("-" * 60 + "\n")
        print(f"词库名:{show(self.lexicon_name)}")
        print(f"词库类型:{show(self.lexicon_type)}")
        print(f"描述信息:{show(self.lexicon_describe)}")
        print(f"词库示例:{show(self.lexicon_demo)}")
        print("-" * 60 + "\n")
        print("\n")

    def bytes2str(self, data):
        """Decode ``data`` (16-bit little-endian code units) to ``str``.

        NUL code units (padding) are dropped. Valid surrogate pairs are
        combined into a single character, so text outside the Basic
        Multilingual Plane can later be written out as UTF-8; unpaired
        surrogates are passed through unchanged.

        Raises:
            IndexError: if ``data`` has an odd number of bytes (a code unit
                is cut in half), as the unit-by-unit reader always did.
        """
        if len(data) % 2:
            raise IndexError("index out of range: truncated 16-bit code unit")
        # "surrogatepass" keeps lone surrogates instead of raising, matching
        # the previous chr()-per-unit result for malformed input.
        text = data.decode("utf-16-le", "surrogatepass")
        return text.replace("\x00", "")

    @staticmethod
    def unpack(data, position):
        """Return the unsigned 16-bit little-endian integer at ``position``.

        Raises:
            IndexError: if fewer than two bytes are available at ``position``.
        """
        # Explicit '<' so the result does not depend on the host byte order
        # (plain 'H' means native order).  Indexing the two bytes keeps the
        # IndexError raised on truncated input.
        return struct.unpack(
            '<H', bytes((data[position], data[position + 1])))[0]

    def build_pinyin(self, data):
        """Build the pinyin table from ``data`` and return it.

        The first four bytes are skipped (presumably a table header --
        TODO confirm). Each following entry is a 16-bit index and then a
        length-prefixed string; entries are stored in ``self.pinyin`` as
        ``index -> text``.
        """
        data = data[4:]
        pos = 0
        end = len(data)
        while pos < end:
            index = self.unpack(data, pos)
            pos, syllable = self._match(data, pos + 2, func=self.bytes2str)
            self.pinyin[index] = syllable
        return self.pinyin

    def get_pinyin(self, data):
        """Translate a run of 16-bit pinyin indices into concatenated text.

        Raises:
            KeyError: if an index is not present in ``self.pinyin``.
        """
        return "".join(
            self.pinyin[self.unpack(data, pos)]
            for pos in range(0, len(data), 2)
        )

    def _match(self, data, position, func):
        """Read one length-prefixed field.

        A 16-bit byte length is read at ``position``; ``func`` is applied to
        the ``length`` bytes that follow it.

        Returns:
            ``(next_position, func_result)`` where ``next_position`` is the
            offset just past the field.
        """
        length = self.unpack(data, position)
        start = position + 2
        end = start + length
        return end, func(data[start:end])

    def build_tokens(self, data):
        """Build the word list from ``data`` and return it.

        Each group is laid out as: a 16-bit number of words sharing one
        pronunciation, a length-prefixed run of pinyin indices, then for each
        word a length-prefixed string, a 16-bit extension length and the
        extension bytes, whose first 16 bits are stored as ``count``.

        ``(count, pinyin, word)`` tuples are appended to ``self.dictionary``.
        """
        pos = 0
        end = len(data)
        while pos < end:
            same = self.unpack(data, pos)
            pos, pinyin = self._match(data, pos + 2, func=self.get_pinyin)
            for _ in range(same):
                pos, word = self._match(data, pos, func=self.bytes2str)
                ext_len = self.unpack(data, pos)
                pos += 2
                count = self.unpack(data, pos)
                self.dictionary.append((count, pinyin, word))
                # Skip the whole extension block, not just the count field.
                pos += ext_len
        return self.dictionary

    def __call__(self, *args, **kwargs):
        """Return the parsed word list; any arguments are ignored."""
        return self.dictionary

    def save(self, path, all_info=False):
        """Write the word list to ``path`` as UTF-8 text, one entry per line.

        Args:
            path: destination file; overwritten if it exists.
            all_info: when true each line holds count, pinyin and word joined
                by three tabs; otherwise only the word is written.
        """
        if all_info:
            lines = ["\t\t\t".join(map(str, entry)) + "\n"
                     for entry in self.dictionary]
        else:
            lines = [f"{entry[-1]}\n" for entry in self.dictionary]
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(lines)

    def __len__(self):
        """Return the number of parsed word entries."""
        return len(self.dictionary)
|