NLP(10)--TFIDF优劣势及其应用Demo

前言

仅记录学习过程，有问题欢迎讨论

TF*IDF：

在这里插入图片描述
优势：

可解释性好
可以清晰地看到关键词
即使预测结果出错，也很容易找到原因
计算速度快
分词本身占耗时最多，其余为简单统计计算
对标注数据依赖小
可以使用无标注语料完成一部分工作
可以与很多算法组合使用
可以看做是词权重

劣势：
1.受分词效果影响大

2.词与词之间没有语义相似度

3.没有语序信息（词袋模型）

4.能力范围有限，无法完成复杂任务，如机器翻译和实体挖掘等

5.样本不均衡会对结果有很大影响

6.类内样本间分布不被考虑

代码

Demo1：手动实现TFIDF

"""
实现一个 TFIDF
"""
import jieba
import math
import os
import json
from collections import defaultdict


def build_tf_idf_dict(corpus):
    tf_dict = defaultdict(dict)  # key:文档序号，value：dict，文档中每个词出现的频率
    idf_dict = defaultdict(set)  # key:词， value：set，文档序号，最终用于计算每个词在多少篇文档中出现过
    for text_index, text_words in enumerate(corpus):
        for word in text_words:
            if word not in tf_dict[text_index]:
                tf_dict[text_index][word] = 0
            tf_dict[text_index][word] += 1
            idf_dict[word].add(text_index)
    idf_dict = dict([(key, len(value)) for key, value in idf_dict.items()])
    return tf_dict, idf_dict


# 根据tf值和idf值计算tfidf
def calculate_tf_idf(tf_dict, idf_dict):
    tf_idf_dict = defaultdict(dict)
    for text_index, word_tf_count_dict in tf_dict.items():
        for word, tf_count in word_tf_count_dict.items():
            tf = tf_count / sum(word_tf_count_dict.values())
            # tf-idf = tf * log(D/(idf + 1))
            tf_idf_dict[text_index][word] = tf * math.log(len(tf_dict) / (idf_dict[word] + 1))
    return tf_idf_dict


# 计算样本的 tfidf
def calculate_tfidf(corpus):
    corpus = [jieba.cut(text) for text in corpus]
    tf_dict, idf_dict = build_tf_idf_dict(corpus)
    tf_idf_dict = calculate_tf_idf(tf_dict, idf_dict)
    return tf_idf_dict


# 取出前k个 tfidf最大的数据
def tf_idf_topk(tfidf_dict, paths=[], top=10, print_word=True):
    topk_dict = {}
    for text_index, text_tfidf_dict in tfidf_dict.items():
        # idf 逆序
        word_list = sorted(text_tfidf_dict.items(), reverse=True, key=lambda x: x[1])
        # 去排序后的前top个
        topk_dict[text_index] = word_list[:top]
        if print_word:
            print(text_index, paths[text_index])
            for i in range(top):
                print(word_list[i])
            print("----------")
    return topk_dict


def main():
    dir_path = r"week4/category_corpus/"
    corpus = []
    paths = []
    for path in os.listdir(dir_path):
        path = os.path.join(dir_path, path)
        if path.endswith("txt"):
            corpus.append(open(path, encoding="utf8").read())
            paths.append(os.path.basename(path))
    tf_idf_dict = calculate_tfidf(corpus)
    tf_idf_topk(tf_idf_dict, paths)


if __name__ == "__main__":
    main()

Demo2:利用 tfidf 实现简单搜索引擎功能

"""
利用 tfidf 实现简单搜索引擎功能

"""

import jieba
import math
import os
import json
from collections import defaultdict

# 加载文档数据（可以想象成网页数据），计算每个网页的tfidf字典
from day0429_1 import calculate_tfidf


def load_data(path):
    # path = "/week4/news.json"
    corpus = []
    with open(path, encoding="utf8") as f:
        documents = json.loads(f.read())
        for document in documents:
            corpus.append(document['title'] + "\n" + document["content"])
        tf_idf_dict = calculate_tfidf(corpus)
    return tf_idf_dict, corpus


def search_engine(query_str, tf_idf_dict, corpus, top=3):
    query_words = jieba.lcut(query_str)
    res = []
    for doc_id, tf_idf in tf_idf_dict.items():
        score = 0
        for word in query_words:
            # 搜到关键词了 score++
            score += tf_idf.get(word, 0)
        res.append([doc_id, score])
    res = sorted(res, reverse=True, key=lambda x: x[1])
    for i in range(top):
        doc_id = res[i][0]
        print(corpus[doc_id])
        print("--------------")
    return res

if __name__ == "__main__":
    path = "C:\\Users\\Administrator\\Desktop\\LearnPython\\week4\\news.json"
    tf_idf_dict, corpus = load_data(path)
    while True:
        query = input("请输入您要搜索的内容:")
        search_engine(query, tf_idf_dict, corpus)

Demo3 :基于tfidf实现简单文本摘要

import jieba
import math
import os
import random
import re
import json
from collections import defaultdict

from day0429_1 import calculate_tfidf

"""
基于tfidf实现简单文本摘要
"""


# 加载文档数据（可以想象成网页数据），计算每个网页的tfidf字典
def load_data(file_path):
    corpus = []
    with open(file_path, encoding="utf8") as f:
        documents = json.loads(f.read())
        for document in documents:
            assert "\n" not in document["title"]
            assert "\n" not in document["content"]
            corpus.append(document["title"] + "\n" + document["content"])
        tf_idf_dict = calculate_tfidf(corpus)
    return tf_idf_dict, corpus


# 计算每一篇文章的摘要
# 输入该文章的tf_idf词典，和文章内容
# top为人为定义的选取的句子数量
# 过滤掉一些正文太短的文章，因为正文太短在做摘要意义不大
def generate_document_abstract(document_tf_idf, document, top=3):
    sentences = re.split("？|！|。", document)
    if len(sentences) < 5:
        return None
    res = []
    for index, sentence in enumerate(sentences):
        sentence_score = 0
        words = jieba.lcut(sentence)
        for word in words:
            sentence_score += document_tf_idf.get(word, 0)
        # 记录下每句话的分数和下标
        res.append([sentence_score, index])
    res = sorted(res, reverse=True, key=lambda x: x[0])
    # 权重最高的可能依次是第10，第6，第3句，将他们调整为出现顺序比较合理，即3,6,10
    important_sentence_indexs = sorted([x[1] for x in res[:top]])
    return "。".join([sentences[index] for index in important_sentence_indexs])


# 生成摘要
def generate_abstract(tf_idf_dict, corpus):
    res = []
    for index, document_tf_idf in tf_idf_dict.items():
        title, content = corpus[index].split("\n")
        abstract = generate_document_abstract(document_tf_idf, content)
        if abstract is None:
            continue
        corpus[index] = "\n" + abstract
        res.append({"标题": title, "正文": content, "摘要": abstract})
    return res

if __name__ == "__main__":
    path = "C:\\Users\\Administrator\\Desktop\\LearnPython\\week4\\news.json"
    tf_idf_dict, corpus = load_data(path)
    res = generate_abstract(tf_idf_dict, corpus)
    writer = open("abstract.json", "w", encoding="utf8")
    writer.write(json.dumps(res, ensure_ascii=False, indent=2))
    writer.close()