Posted on 2006年9月13日 7:26
在Python脚本里需要实现中文分词,研究了以下方案:
结果:未发现有这样的模块
结果:估算自己不可能在一周内写出满足自己要求的模块
结果:由于不熟悉扩展库的编写,一周之内不可能完成
结果:使用ctypes折腾了两天,成功调用了该分词库,现贴出测试代码:
# -*- coding: cp936 -*-
from ctypes import *
class SHLSegWord(Structure):
    """ctypes layout of one segmentation result record.

    ``HLGetWordAt`` is configured in ``main`` to return a pointer to this
    structure, and ``s_szWord`` is the field printed for each result.
    """

    _fields_ = [
        # C string holding the segmented word.
        ('s_szWord', c_char_p),
        # NOTE(review): presumably a part-of-speech code -- confirm against
        # the HLSSplit header.
        ('s_dwPOS', c_long),
        # NOTE(review): presumably a keyword weight -- confirm against the
        # HLSSplit header.
        ('s_fWeight', c_float),
    ]
def main():
    """Segment a sample sentence with the HLSSplit DLL and print each word.

    The function loads ``HLSSplit`` through ``ctypes.cdll``, initialises the
    dictionary, opens a segmentation handle, splits a hard-coded sentence and
    prints the ``s_szWord`` field of every result record.

    Cleanup is done with ``try/finally`` so that ``HLCloseSplit`` and
    ``HLFreeSplit`` run on every exit path once the matching init/open call
    has succeeded, including when opening the handle fails or an exception
    is raised while reading results.

    Raises:
        The exception built by ``ctypes.WinError()`` when dictionary
        initialisation, handle creation or segmentation reports failure.
    """
    # Resolve every entry point up front so a missing export fails early.
    hl_dll = cdll.HLSSplit
    HLSplitInit = hl_dll.HLSplitInit
    HLFreeSplit = hl_dll.HLFreeSplit
    HLOpenSplit = hl_dll.HLOpenSplit
    HLCloseSplit = hl_dll.HLCloseSplit
    HLSplitWord = hl_dll.HLSplitWord
    HLGetWordCnt = hl_dll.HLGetWordCnt
    HLGetWordAt = hl_dll.HLGetWordAt
    # Without this ctypes would treat the returned pointer as a plain int.
    HLGetWordAt.restype = POINTER(SHLSegWord)

    # Value HLOpenSplit's result is compared against to detect failure.
    INVALID_HANDLE_VALUE = -1

    # None is passed as the dictionary path. The original notes also list
    # 'HLSplitWord.dat' and 'D:\\hlssplit\\Lib\\HLSplitWord.dat' as
    # alternatives that can be passed instead.
    strDictPath = None

    # NOTE(review): third argument of HLSplitWord; 0 presumably requests no
    # extra calculations -- confirm against the HLSSplit header.
    iExtraCalcFlag = 0

    # Alternative sample text from the original notes:
    # '海量智能分词研究版调用测试'
    lpText = '国务院一月二十五日举行春节团拜会, 胡锦涛主席走进会场代表中央政治局致词。'

    if not HLSplitInit(strDictPath):
        print('初始化分词字典失败!')
        raise WinError()
    try:
        hHandle = HLOpenSplit()
        if INVALID_HANDLE_VALUE == hHandle:
            print('创建分词句柄失败!')
            # WinError() is evaluated before the finally clause runs.
            raise WinError()
        try:
            if not HLSplitWord(hHandle, lpText, iExtraCalcFlag):
                print('分词失败!')
                raise WinError()
            for i in xrange(HLGetWordCnt(hHandle)):
                pWord = HLGetWordAt(hHandle, i)
                print(pWord[0].s_szWord)
        finally:
            # Always release the handle once it was successfully opened.
            HLCloseSplit(hHandle)
    finally:
        # Always release the dictionary once it was successfully initialised.
        HLFreeSplit()
# Run the segmentation demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
Technorati : Chinese Word Segmenter, Python, 中文分词