python

关注公众号 jb51net

关闭
首页 > 脚本专栏 > python > Python文件相似度检测

基于Python实现一个文件相似度检测工具

作者:Yant224

本文介绍基于Python的文件相似度检测工具,支持余弦、Jaccard、编辑距离等算法,具备预处理、阈值判断和健壮性处理,适用于文档查重、代码检测及版本比较,需要的朋友可以参考下

基于文本内容比较的相似度检测工具

以下是一个完整的文件相似度检测函数实现,支持多种相似度算法和阈值判断:

import difflib
from pathlib import Path
import re
from collections import Counter
import math
import string

def are_files_similar(
    file1: str | Path,
    file2: str | Path,
    similarity_threshold: float = 0.8,
    method: str = "cosine"
) -> bool:
    """
    Compare two files and report whether their similarity meets a threshold.

    Args:
        file1: Path to the first file.
        file2: Path to the second file.
        similarity_threshold: Similarity threshold in [0, 1].
        method: Similarity algorithm to use:
            'cosine'      - cosine similarity (default)
            'jaccard'     - Jaccard similarity
            'levenshtein' - edit-distance similarity
            'sequence'    - difflib sequence-matching similarity

    Returns:
        bool: True if the computed similarity is >= similarity_threshold.

    Raises:
        ValueError: If `method` is not one of the supported names.
        FileNotFoundError: If either file does not exist.
    """
    # Dispatch table keeps the method lookup in one place and lets us
    # reject an unknown `method` *before* doing any file I/O.
    methods = {
        "cosine": _cosine_similarity,
        "jaccard": _jaccard_similarity,
        "levenshtein": _levenshtein_similarity,
        "sequence": _sequence_similarity,
    }
    if method not in methods:
        raise ValueError(f"未知的相似度计算方法: {method}")

    content1 = _read_file(file1)
    content2 = _read_file(file2)

    # Two empty files are considered identical.
    if not content1 and not content2:
        return True

    return methods[method](content1, content2) >= similarity_threshold

def _read_file(file_path: str | Path) -> str:
    """读取文件内容并进行预处理"""
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {path}")
    
    # 读取文件内容
    try:
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()
    except UnicodeDecodeError:
        # 尝试其他编码
        with open(path, 'r', encoding='latin-1') as f:
            content = f.read()
    
    # 基础预处理
    content = content.lower()
    content = re.sub(r'\s+', ' ', content)  # 合并连续空白
    return content.strip()

def _cosine_similarity(text1: str, text2: str) -> float:
    """Cosine similarity between two texts over word-frequency vectors.

    Returns 0.0 when either text produces no tokens.
    """
    vec1 = Counter(_tokenize(text1))
    vec2 = Counter(_tokenize(text2))

    # The dot product only has non-zero terms for words present in BOTH
    # texts, so iterate the key intersection instead of materializing
    # dense vectors over the full combined vocabulary.
    common = vec1.keys() & vec2.keys()
    dot_product = sum(vec1[word] * vec2[word] for word in common)

    # Vector magnitudes come straight from each Counter's own values.
    magnitude1 = math.sqrt(sum(c * c for c in vec1.values()))
    magnitude2 = math.sqrt(sum(c * c for c in vec2.values()))

    # Either text empty after tokenization -> define similarity as 0.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    return dot_product / (magnitude1 * magnitude2)

def _jaccard_similarity(text1: str, text2: str) -> float:
    """Jaccard similarity: |intersection| / |union| of the two word sets.

    Two empty token sets are defined as identical (similarity 1.0).
    """
    words1 = set(_tokenize(text1))
    words2 = set(_tokenize(text2))

    union_size = len(words1 | words2)
    if union_size == 0:
        # Both texts tokenize to nothing: treat them as identical.
        return 1.0

    return len(words1 & words2) / union_size

def _levenshtein_similarity(text1: str, text2: str) -> float:
    """Similarity based on Levenshtein edit distance.

    Defined as 1 - distance / max(len1, len2), so the result lies in [0, 1].

    Fixes two issues in the naive version:
      * two empty strings now compare as identical (1.0, not 0.0);
      * the DP keeps only two rows, so memory is O(min(n, m)) instead
        of O(n * m).
    """
    n, m = len(text1), len(text2)
    if n == 0 and m == 0:
        return 1.0  # two empty strings are identical
    if n == 0 or m == 0:
        return 0.0  # every character of the other string must be inserted

    # Make text2 the shorter string so the DP rows are as small as possible.
    if m > n:
        text1, text2 = text2, text1
        n, m = m, n

    previous = list(range(m + 1))
    for i in range(1, n + 1):
        current = [i] + [0] * m
        for j in range(1, m + 1):
            cost = 0 if text1[i - 1] == text2[j - 1] else 1
            current[j] = min(
                previous[j] + 1,         # deletion
                current[j - 1] + 1,      # insertion
                previous[j - 1] + cost,  # substitution
            )
        previous = current

    distance = previous[m]
    return 1 - distance / max(n, m)

def _sequence_similarity(text1: str, text2: str) -> float:
    """Similarity via difflib's matching-subsequence ratio, in [0, 1]."""
    return difflib.SequenceMatcher(None, text1, text2).ratio()

# Translation table that deletes every ASCII punctuation character;
# built once at module load instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def _tokenize(text: str) -> list[str]:
    """Split text into words after stripping ASCII punctuation."""
    return text.translate(_PUNCT_TABLE).split()

一、使用示例

1. 基本使用

# 比较两个文件是否相似度超过80%
result = are_files_similar("file1.txt", "file2.txt", 0.8)
print(f"文件相似: {result}")

2. 使用不同算法

# 使用Jaccard相似度
result = are_files_similar("doc1.md", "doc2.md", method="jaccard")

# 使用编辑距离相似度
result = are_files_similar("code1.py", "code2.py", method="levenshtein")

3. 批量比较

def find_similar_files(directory, threshold=0.9):
    """Find pairs of similar files in a directory (non-recursive).

    Args:
        directory: Directory to scan.
        threshold: Similarity threshold forwarded to are_files_similar.

    Returns:
        list[tuple[str, str]]: Names of file pairs meeting the threshold.
    """
    from itertools import combinations

    # Bug fix: glob("*") also yields subdirectories; trying to read one
    # raises IsADirectoryError, so keep regular files only.
    files = [p for p in Path(directory).glob("*") if p.is_file()]

    return [
        (file1.name, file2.name)
        for file1, file2 in combinations(files, 2)
        if are_files_similar(file1, file2, threshold)
    ]

二、功能特点

1. 多算法支持

| 算法 | 适用场景 | 特点 |
| --- | --- | --- |
| 余弦相似度 | 长文档、自然语言 | 考虑词频,忽略词序 |
| Jaccard相似度 | 短文本、关键词匹配 | 基于集合运算 |
| 编辑距离相似度 | 代码、配置文件 | 考虑字符级差异 |
| 序列匹配相似度 | 通用文本 | Python内置算法 |

2. 预处理流程

  1. 统一小写
  2. 合并连续空白
  3. 移除标点符号(分词时)
  4. 标准化编码处理

3. 健壮性设计

三、性能优化建议

1. 大文件处理

def _read_large_file(file_path: Path) -> str:
    """Read a large file in 64KB chunks, lowercasing as it goes.

    Bug fix: chunks are joined with '' rather than ' ' — joining with a
    space inserted a spurious space every 64KB, splitting any word that
    happened to straddle a chunk boundary.
    """
    chunks = []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        # Walrus loop: read a 64KB chunk at a time until EOF.
        while chunk := f.read(65536):
            chunks.append(chunk.lower())
    return ''.join(chunks)

2. 内存优化版Jaccard

def _jaccard_similarity_large(text1: str, text2: str) -> float:
    """Approximate Jaccard similarity for large texts via MinHash.

    Uses datasketch's MinHash (128 permutations) to estimate the Jaccard
    index without holding both full word sets in memory at once.
    """
    from datasketch import MinHash

    def signature(text: str):
        # One MinHash update per *unique* token of the text.
        mh = MinHash(num_perm=128)
        for token in set(_tokenize(text)):
            mh.update(token.encode('utf-8'))
        return mh

    return signature(text1).jaccard(signature(text2))

3. 并行处理

from concurrent.futures import ThreadPoolExecutor

def batch_compare(file_pairs, threshold=0.8):
    """Compare many file pairs concurrently.

    Args:
        file_pairs: Iterable of (Path, Path) pairs to compare.
        threshold: Similarity threshold forwarded to are_files_similar.

    Returns:
        dict: Maps (name1, name2) to the boolean comparison result.
    """
    with ThreadPoolExecutor() as pool:
        pending = {}
        for first, second in file_pairs:
            future = pool.submit(are_files_similar, first, second, threshold)
            pending[(first.name, second.name)] = future
        # Collect inside the `with` block; the executor shuts down after.
        return {names: future.result() for names, future in pending.items()}

四、应用场景

1. 文档查重

def check_plagiarism(submitted_file, source_files, threshold=0.7):
    """Check a submission against known sources; True on the first match.

    Prints the matching source file before returning True.
    """
    for source in source_files:
        if not are_files_similar(submitted_file, source, threshold):
            continue
        print(f"检测到与 {source} 相似")
        return True
    return False

2. 代码相似度检测

def detect_code_clones(repo_path):
    """Detect near-duplicate Python files in a repository (recursive).

    Returns:
        list[tuple[Path, Path]]: File pairs whose levenshtein
        similarity is at least 0.85.
    """
    # Bug fix: `combinations` was used without being imported in this
    # scope, raising NameError at runtime.
    from itertools import combinations

    code_files = list(Path(repo_path).rglob("*.py"))
    clones = []

    for file1, file2 in combinations(code_files, 2):
        if are_files_similar(file1, file2, 0.85, method="levenshtein"):
            clones.append((file1, file2))

    return clones

3. 文件版本比较

def find_most_similar_version(target_file, versions):
    """Return the (version_file, similarity) pair most similar to target_file.

    Bug fix: the original passed each version through are_files_similar,
    which returns a *bool*, so "sorting by similarity" merely sorted
    True/False values. Compute the actual sequence-similarity score.
    """
    # Read the target once and reuse its normalized content for every version.
    target_content = _read_file(target_file)
    similarities = [
        (version_file, _sequence_similarity(target_content, _read_file(version_file)))
        for version_file in versions
    ]
    # Highest similarity wins; ties resolve to the earliest version listed.
    return max(similarities, key=lambda item: item[1])

五、测试用例

import unittest
import tempfile

class TestFileSimilarity(unittest.TestCase):
    """Unit tests for are_files_similar."""

    def setUp(self):
        # delete=False so the files survive their handles being closed
        # and can be reopened by path (required on Windows, where an
        # open NamedTemporaryFile cannot be opened a second time).
        self.file1 = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        self.file2 = tempfile.NamedTemporaryFile(delete=False, mode='w+')
        self.file3 = tempfile.NamedTemporaryFile(delete=False, mode='w+')

        self.file1.write("This is a test file for similarity comparison.")
        self.file2.write("This is a test file for similarity comparison.")
        self.file3.write("This is a completely different file content.")

        # Bug fix: close instead of flush — guarantees the data is on
        # disk AND releases the handles so tearDown's unlink works on
        # every platform (flush alone leaked three handles per test).
        self.file1.close()
        self.file2.close()
        self.file3.close()

    def test_identical_files(self):
        self.assertTrue(are_files_similar(self.file1.name, self.file2.name))

    def test_different_files(self):
        self.assertFalse(are_files_similar(self.file1.name, self.file3.name, 0.8))

    def test_empty_files(self):
        with tempfile.NamedTemporaryFile(mode='w+') as empty1, \
             tempfile.NamedTemporaryFile(mode='w+') as empty2:
            self.assertTrue(are_files_similar(empty1.name, empty2.name))

    def test_various_methods(self):
        # Bug fix: are_files_similar returns a bool, not a score, so
        # assert truth with a high threshold instead of comparing to
        # 1.0 (the original assertAlmostEqual passed only because
        # True == 1 in Python).
        for method in ("cosine", "jaccard", "levenshtein", "sequence"):
            with self.subTest(method=method):
                self.assertTrue(
                    are_files_similar(self.file1.name, self.file2.name, 0.99, method)
                )

    def tearDown(self):
        # Remove the temporary files created in setUp.
        for tmp in (self.file1, self.file2, self.file3):
            Path(tmp.name).unlink()

# Run the unit tests when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

总结

这个文件相似度检测函数提供了:

  1. 多种算法选择:余弦、Jaccard、编辑距离、序列匹配
  2. 阈值判断:灵活设置相似度阈值
  3. 健壮性处理:编码处理、空文件处理
  4. 易用接口:支持字符串和Path对象

使用示例:

# 基本使用
result = are_files_similar("file1.txt", "file2.txt", 0.75)

# 指定算法
result = are_files_similar("doc1.md", "doc2.md", method="jaccard")

通过这个函数,您可以轻松实现文档查重、代码相似度检测和文件版本比较等功能。

以上就是基于Python实现一个文件相似度检测工具的详细内容,更多关于Python文件相似度检测的资料请关注脚本之家其它相关文章!

您可能感兴趣的文章:
阅读全文