首页 > 编程 > Python > 正文

Python探索之pLSA实现代码

2020-02-16 10:27:49
字体:
来源:转载
供稿:网友

pLSA(probabilistic Latent Semantic Analysis,概率潜在语义分析)是 Thomas Hofmann 于 1999 年提出的主题模型,被认为是第一个能够处理一词多义问题的模型。它在文档与单词之间引入一层隐含的主题(Topic),把文档与单词的直接关联分解为文档与主题的关联以及主题与单词的关联。下面的实现采用 EM 算法估计模型参数;代码中可能存在差错,欢迎批评指正。

# -*- coding: utf-8 -*-
"""pLSA (probabilistic Latent Semantic Analysis) fitted with the EM algorithm."""
import codecs
import datetime
import io
import math
import random

try:
    import jieba
except ImportError:
    # Only loadCorpus() needs the tokenizer; keep the rest of the model
    # importable and report the missing package when it is actually used.
    jieba = None


class pLSA_lph(object):
    """pLSA topic model over a bag-of-words corpus, trained with EM.

    Index conventions used by every matrix below: ``d`` is a document,
    ``j`` is a vocabulary word and ``k`` is a topic.

    Attributes:
        n_doc, n_word, n_topic: corpus / vocabulary / topic sizes.
        corpus: ``corpus[d][j]`` is the count of word ``j`` in document ``d``.
        vocab: list of distinct words, in order of first appearance.
        p_z_dw: ``p_z_dw[k][d][j]`` is the posterior P(z_k | d, w_j).
        p_w_z: ``p_w_z[j][k]`` is P(w_j | z_k).
        p_z_d: ``p_z_d[k][d]`` is P(z_k | d).
        likelihood: log-likelihood computed by the last training iteration.
        stop_words: tokens dropped after word segmentation.
    """

    # Lower bound applied before taking a logarithm so that an underflowed
    # probability cannot raise ``ValueError: math domain error``.
    _LOG_FLOOR = 1e-300

    def __init__(self, ntopic = 5):
        """Create an empty model with ``ntopic`` latent topics."""
        self.n_doc = 0
        self.n_word = 0
        self.n_topic = ntopic
        self.corpus = None
        self.p_z_dw = None
        self.p_w_z = None
        self.p_z_d = None
        self.likelihood = 0
        self.vocab = None
        self.stop_words = [u',', u'。', u'、', u'(', u')', u'·', u'!', u' ', u':', u'“', u'”', u'\n']

    def _rand_mat(self, sizex, sizey):
        """Return a ``sizex`` x ``sizey`` matrix of positive reals.

        Every row is normalised to sum to 1, i.e. each row is a random
        probability distribution.
        """
        ret = []
        for _ in range(sizex):
            row = [random.random() for _ in range(sizey)]
            norm = sum(row)
            ret.append([value / norm for value in row])
        return ret

    def _read_lines(self, fn):
        """Read ``fn`` as text and return its lines (line endings kept).

        UTF-8 is tried first; if the bytes are not valid UTF-8 the file is
        re-read as GBK with undecodable bytes dropped.
        """
        try:
            with io.open(fn, 'r', encoding='utf-8') as f:
                return f.readlines()
        except UnicodeDecodeError:
            with io.open(fn, 'r', encoding='gbk', errors='ignore') as f:
                return f.readlines()

    def _segment(self, text, stop_words):
        """Segment ``text`` with jieba and drop empty tokens and stop words."""
        return [tok for tok in jieba.cut(text) if tok and tok not in stop_words]

    def loadCorpus(self, fn):
        """Build the document-word count matrix from ``fn`` and initialise the model.

        Each line of the file is one document.  The vocabulary is collected
        from the whole text, then every line is segmented again to count the
        vocabulary words it contains.  Finally the three probability tables
        are filled with random row-normalised values.

        Raises:
            ImportError: if the ``jieba`` package is not installed.
        """
        if jieba is None:
            raise ImportError("jieba is required by loadCorpus() for word segmentation")

        lines = self._read_lines(fn)
        stop_words = set(self.stop_words)

        # Pass 1: collect every distinct word.  ``position`` mirrors
        # ``self.vocab`` so membership tests and index lookups are O(1).
        self.vocab = []
        position = {}
        for word in self._segment(u' '.join(lines), stop_words):
            if word != u' ' and word not in position:
                position[word] = len(self.vocab)
                self.vocab.append(word)
        self.n_word = len(self.vocab)

        # Pass 2: per-document term frequencies (blank lines become
        # all-zero documents so row d still corresponds to line d).
        CountMatrix = []
        for line in lines:
            count = [0] * self.n_word
            for word in self._segment(line.strip(), stop_words):
                col = position.get(word)
                if col is not None:
                    count[col] += 1
            CountMatrix.append(count)
        self.corpus = CountMatrix
        self.n_doc = len(CountMatrix)

        # Random start.  NOTE: _rand_mat normalises rows, so p_z_d is
        # normalised over documents and p_w_z over topics here; the first
        # M-step replaces both with properly normalised estimates.
        self.p_z_d = self._rand_mat(self.n_topic, self.n_doc)
        self.p_w_z = self._rand_mat(self.n_word, self.n_topic)
        self.p_z_dw = []
        for k in range(self.n_topic):
            self.p_z_dw.append(self._rand_mat(self.n_doc, self.n_word))

    def _e_step(self):
        """E-step: P(z_k | d, w_j) = P(w_j|z_k) P(z_k|d) / sum_k' P(w_j|z_k') P(z_k'|d)."""
        topics = range(self.n_topic)
        for d in range(self.n_doc):
            for j in range(self.n_word):
                # The denominator does not depend on k, so compute the
                # joint terms once and normalise them.
                joint = [self.p_w_z[j][k] * self.p_z_d[k][d] for k in topics]
                denom = sum(joint)
                if denom <= 0:
                    denom = 1e-6
                for k in topics:
                    self.p_z_dw[k][d][j] = joint[k] / denom

    def _m_step(self):
        """M-step: re-estimate P(w_j | z_k) and P(z_k | d) from the posteriors.

        Both updates read only ``corpus`` and ``p_z_dw``, so each
        normalising sum is computed once per topic / per document.
        """
        docs = range(self.n_doc)
        words = range(self.n_word)
        topics = range(self.n_topic)

        print("updating Pn(Wj|Zk)...\r")
        for k in topics:
            posterior = self.p_z_dw[k]
            # Expected count of word j generated by topic k.
            weight = [sum(self.corpus[d][j] * posterior[d][j] for d in docs)
                      for j in words]
            total = sum(weight)
            if total <= 0:
                total = 1e-6
            for j in words:
                self.p_w_z[j][k] = weight[j] / total

        print("updating Pn(Zk|Di)...\r")
        for d in docs:
            counts = self.corpus[d]
            # Expected number of tokens of document d assigned to topic k.
            weight = [sum(counts[j] * self.p_z_dw[k][d][j] for j in words)
                      for k in topics]
            total = sum(weight)
            if total <= 0:
                total = 1e-6
            for k in topics:
                self.p_z_d[k][d] = weight[k] / total

    def _cal_max_likelihood(self):
        """Store the corpus log-likelihood in ``self.likelihood``.

        Uses P(d, w_j) = (1 / n_doc) * sum_k P(w_j|z_k) P(z_k|d).
        """
        self.likelihood = 0
        for d in range(self.n_doc):
            for j in range(self.n_word):
                count = self.corpus[d][j]
                if not count:
                    # A zero count contributes nothing; skipping it also
                    # avoids log(0) for words with zero probability.
                    continue
                prob = 0
                for k in range(self.n_topic):
                    prob += self.p_w_z[j][k] * self.p_z_d[k][d]
                prob = 1.0 / self.n_doc * prob
                self.likelihood += count * math.log(max(prob, self._LOG_FLOOR))

    def train(self, n_iter = 100, d_delta = 1e-6, log_fn = "log.log"):
        """Run EM until ``n_iter`` iterations or until the likelihood settles.

        Args:
            n_iter: maximum number of EM iterations.
            d_delta: stop once the absolute change of the log-likelihood
                between two iterations is not greater than this value.
            log_fn: path of the text file that receives one line per iteration.
        """
        itr = 0
        delta = 10e9
        with open(log_fn, 'w') as f:
            while itr < n_iter and delta > d_delta:
                previous = self.likelihood
                self._e_step()
                self._m_step()
                self._cal_max_likelihood()
                itr += 1
                delta = abs(self.likelihood - previous)
                # NOTE(review): the trailing "-%y" repeats the year as two
                # digits; kept as-is so existing log consumers see no change.
                t1 = datetime.datetime.now().strftime('%Y-%m-%d-%y %H:%M:%S')
                f.write("%s iteration %d, max-likelihood = %.6f\n" % (t1, itr, self.likelihood))
                print("%s iteration %d, max-likelihood = %.6f" % (t1, itr, self.likelihood))

    def printVocabulary(self):
        """Print the vocabulary on one line, words separated by spaces."""
        print("vocabulary:")
        print(u' '.join(self.vocab))

    def saveVocabulary(self, fn):
        """Write the vocabulary to ``fn`` as UTF-8, one word per line."""
        with codecs.open(fn, 'w', 'utf-8') as f:
            for word in self.vocab:
                f.write(u"%s\n" % word)

    def printWordOfTopic(self):
        """Print P(w_j | z_k) for every topic, one topic per line."""
        for k in range(self.n_topic):
            cells = ["Topic %d" % k]
            cells.extend(str(self.p_w_z[j][k]) for j in range(self.n_word))
            print(" ".join(cells))

    def saveWordOfTopic(self, fn):
        """Write P(w_j | z_k) to ``fn`` as a comma-separated table (rows = topics)."""
        with open(fn, 'w') as f:
            for j in range(self.n_word):
                f.write(", w%d" % j)
            f.write("\n")
            for k in range(self.n_topic):
                f.write("topic %d" % k)
                for j in range(self.n_word):
                    f.write(", %.6f" % self.p_w_z[j][k])
                f.write("\n")

    def printTopicOfDoc(self):
        """Print P(z_k | d) for every document, one document per line."""
        for d in range(self.n_doc):
            cells = ["Doc %d" % d]
            cells.extend(str(self.p_z_d[k][d]) for k in range(self.n_topic))
            print(" ".join(cells))

    def saveTopicOfDoc(self, fn):
        """Write P(z_k | d) to ``fn`` as a comma-separated table (rows = documents)."""
        with open(fn, 'w') as f:
            for k in range(self.n_topic):
                f.write(", z%d" % k)
            f.write("\n")
            for d in range(self.n_doc):
                f.write("doc %d" % d)
                for k in range(self.n_topic):
                    f.write(", %.6f" % self.p_z_d[k][d])
                f.write("\n")
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表