以下是针对两个字符串列表(String类型的List)计算相似度并判定是否为Direct Mapping的完整解决方案:
一、计算两列字符串列表相似度的5种核心方法
- 简单词法相似度
from collections import Counter
import numpy as np

def lexical_similarity(list1, list2):
    """Compute word-level Jaccard and cosine similarity between two string lists.

    Each list is flattened into a bag of lowercase words; None/empty entries
    are skipped (the preprocessing step in this document produces None for
    NULL cells, and ``' '.join`` would crash on them).

    Returns a dict with keys ``'jaccard'`` and ``'cosine'`` (both floats in
    [0, 1]; 0.0 when either side has no words).
    """
    # Fix: filter out falsy entries (None / '') before joining.
    c1 = Counter(' '.join(s for s in list1 if s).lower().split())
    c2 = Counter(' '.join(s for s in list2 if s).lower().split())

    # Jaccard over the word *sets*.
    intersection = len(set(c1) & set(c2))
    union = len(set(c1) | set(c2))
    jaccard = intersection / union if union > 0 else 0.0

    # Cosine over the word-frequency vectors.
    all_words = list(set(c1) | set(c2))
    vec1 = np.array([c1.get(word, 0) for word in all_words])
    vec2 = np.array([c2.get(word, 0) for word in all_words])
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Fix: guard the zero-norm case (empty vocabulary) instead of dividing by 0.
    cosine = float(np.dot(vec1, vec2) / denom) if denom > 0 else 0.0

    return {'jaccard': jaccard, 'cosine': cosine}
- 序列匹配相似度
from difflib import SequenceMatcher
import numpy as np
import pandas as pd

def sequence_similarity(list1, list2):
    """Pairwise SequenceMatcher ratio over aligned elements of the two lists.

    Elements are compared position by position (``zip`` truncates to the
    shorter list). Returns a dict with the mean ratio, the worst (minimum)
    ratio, and the full descriptive-statistics distribution.
    """
    ratios = [
        SequenceMatcher(None, str(s1), str(s2)).ratio()
        for s1, s2 in zip(list1, list2)
    ]
    # Fix: empty input made min() raise ValueError and np.mean return nan.
    if not ratios:
        return {'mean_ratio': 0.0, 'min_ratio': 0.0, 'distribution': {}}
    return {
        'mean_ratio': float(np.mean(ratios)),
        'min_ratio': min(ratios),
        'distribution': pd.Series(ratios).describe().to_dict(),
    }
- 语义嵌入相似度(需安装sentence-transformers)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def semantic_similarity(list1, list2):
    """Cosine similarity between sentence embeddings of aligned elements.

    Encodes both lists with the shared SentenceTransformer model and scores
    each aligned pair by cosine similarity of the embeddings. Returns the
    mean score and its descriptive-statistics distribution.
    """
    emb1 = model.encode(list1)
    emb2 = model.encode(list2)
    sim_scores = []
    for e1, e2 in zip(emb1, emb2):
        denom = np.linalg.norm(e1) * np.linalg.norm(e2)
        sim_scores.append(np.dot(e1, e2) / denom)
    return {
        'mean_semantic': np.mean(sim_scores),
        'semantic_distribution': pd.Series(sim_scores).describe().to_dict(),
    }
- 编辑距离相似度
import textdistance

def edit_similarity(list1, list2):
    """Normalized Levenshtein similarity over aligned elements.

    Each pair is scored with textdistance's normalized Levenshtein
    similarity (1.0 = identical strings). Returns the mean score plus its
    descriptive-statistics distribution.
    """
    scores = [
        textdistance.levenshtein.normalized_similarity(str(a), str(b))
        for a, b in zip(list1, list2)
    ]
    return {
        'mean_edit': np.mean(scores),
        'edit_distribution': pd.Series(scores).describe().to_dict(),
    }
- 集合包含度分析
def containment_analysis(list1, list2):
    """Set-containment overlap between the two lists.

    Returns the fraction of list1's distinct values present in list2 (and
    vice versa), plus a sample of the shared values. Empty sides yield 0.
    """
    set1 = set(list1)
    set2 = set(list2)
    common = set1 & set2
    return {
        'list1_in_list2': len(common) / len(set1) if set1 else 0,
        'list2_in_list1': len(common) / len(set2) if set2 else 0,
        'intersection': list(common)[:10],  # show at most 10 shared elements
    }
二、判定Direct Mapping的综合标准
- 硬性条件(必须全部满足)
| 条件 | 阈值/要求 | 检查方法 |
| --- | --- | --- |
| 长度一致性 | 两列长度完全相同 | `len(list1) == len(list2)` |
| NULL值分布一致 | NULL位置相同率≥95% | 比较对应位置的NULL情况 |
| 数据类型一致性 | 均为字符串类型 | `all(isinstance(x, str) for x in list1)` |
- 相似度阈值(需满足至少2项)
| 指标 | Direct Mapping阈值 | 适用场景 |
| --- | --- | --- |
| Jaccard相似度 | ≥0.85 | 词汇集合比较 |
| 余弦相似度 | ≥0.9 | 词频分布比较 |
| 序列匹配均值 | ≥0.8 | 字符串逐行比对 |
| 语义相似度 | ≥0.75 | 深层语义匹配 |
| 编辑距离均值 | ≥0.85 | 拼写差异检查 |
- 综合判定函数
def is_direct_mapping(list1, list2):
    """Decide whether two string columns constitute a Direct Mapping.

    Hard requirements: equal non-zero length and NULL positions agreeing on
    at least 95% of rows. Soft requirements: at least 2 of the 5 scalar
    similarity metrics meet their threshold, and no scalar metric falls to
    0.5 or below.

    Returns True only when all conditions hold.
    """
    # Hard requirement: identical length. Empty lists are rejected outright
    # (fix: the original divided by len(list1) and crashed on empty input).
    if not list1 or len(list1) != len(list2):
        return False

    # NULL-position agreement: mismatch rate must stay within 5%.
    null_pos1 = {i for i, x in enumerate(list1) if pd.isna(x) or x == ''}
    null_pos2 = {i for i, x in enumerate(list2) if pd.isna(x) or x == ''}
    if len(null_pos1.symmetric_difference(null_pos2)) / len(list1) > 0.05:
        return False

    # Collect every similarity dimension.
    metrics = {}
    metrics.update(lexical_similarity(list1, list2))
    metrics.update(sequence_similarity(list1, list2))
    metrics.update(semantic_similarity(list1, list2))
    metrics.update(edit_similarity(list1, list2))

    # Fix: only the scalar scores may vote. The original took
    # min(metrics.values()), which mixed floats with the *_distribution
    # dicts and raised TypeError whenever execution reached this point.
    thresholds = {
        'jaccard': 0.85,
        'cosine': 0.9,
        'mean_ratio': 0.8,
        'mean_semantic': 0.75,
        'mean_edit': 0.85,
    }
    scores = {k: metrics[k] for k in thresholds}
    passed = sum(1 for k, t in thresholds.items() if scores[k] >= t)

    # At least 2 thresholds met, and no single score may collapse below 0.5
    # (prevents one extreme metric from carrying the verdict alone).
    return passed >= 2 and min(scores.values()) > 0.5
三、完整工作流程示例
import pandas as pd

# Sample data
list1 = ["Apple iPhone 13", "Samsung Galaxy S21", None, "华为Mate 50"]
list2 = ["iPhone 13", "Galaxy S21 5G", "", "HUAWEI Mate50"]

# Step 1: preprocessing — lowercase/strip values, map NULL-ish cells to None
def preprocess(lst):
    cleaned = []
    for x in lst:
        if pd.notna(x) and x != '':
            cleaned.append(str(x).lower().strip())
        else:
            cleaned.append(None)
    return cleaned

clean1 = preprocess(list1)
clean2 = preprocess(list2)

# Step 2: run every similarity computation and merge into one report
similarity_report = {}
for compute in (lexical_similarity, sequence_similarity,
                semantic_similarity, edit_similarity,
                containment_analysis):
    similarity_report.update(compute(clean1, clean2))

# Step 3: Direct Mapping verdict
is_direct = is_direct_mapping(clean1, clean2)
print(f"是否Direct Mapping: {is_direct}")
print("相似度报告:")
for k, v in similarity_report.items():
    print(f"{k:>20}: {str(v)[:80]}...")
四、不同场景的阈值调整建议
| 场景特征 | 调整方向 | 示例阈值组合 |
| --- | --- | --- |
| 高精度匹配 | 提高语义权重 | Jaccard≥0.9 + 语义≥0.85 |
| 容错匹配 | 降低编辑距离要求 | 编辑距离≥0.7 + 余弦≥0.8 |
| 多语言数据 | 强化语义分析 | 语义≥0.8 + Jaccard≥0.6 |
| 短文本集合 | 提高序列匹配权重 | 序列匹配≥0.9 + 编辑≥0.9 |
| 包含专业术语 | 添加自定义词典 | 结合领域词典增强语义分析 |
五、可视化分析工具
import matplotlib.pyplot as plt

def visualize_similarity(sim_report):
    """Render the five scalar similarity metrics as a labeled bar chart.

    Bars at or above 0.8 are drawn green, others red; a dashed reference
    line marks the 0.8 threshold. The chart is saved to
    'similarity_metrics.png' in the working directory.
    """
    metrics = ['jaccard', 'cosine', 'mean_ratio', 'mean_semantic', 'mean_edit']
    values = [sim_report[k] for k in metrics]

    fig = plt.figure(figsize=(10, 5))
    bars = plt.bar(
        metrics, values,
        color=['#4CAF50' if v >= 0.8 else '#F44336' for v in values],
    )
    plt.axhline(y=0.8, color='gray', linestyle='--')
    plt.title('Similarity Metrics Comparison')
    plt.ylim(0, 1)

    # Annotate each bar with its value.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{height:.2f}', ha='center', va='bottom')

    plt.savefig('similarity_metrics.png')
    # Fix: close the figure — without this, repeated calls leak figures
    # and matplotlib eventually warns about excessive memory use.
    plt.close(fig)
六、性能优化技巧
采样计算(适用于大数据):
def sampled_similarity(list1, list2, sample_frac=0.3):
    """Estimate semantic similarity on a random sample (for large lists).

    Draws ``sample_frac`` of the aligned positions without replacement and
    runs the (expensive) semantic comparison on just that sample.
    """
    # Fix: int() truncation produced a sample of 0 for short lists (e.g.
    # 3 rows at frac=0.3), which made the downstream mean nan; always keep
    # at least one element.
    sample_size = max(1, int(len(list1) * sample_frac))
    indices = np.random.choice(len(list1), sample_size, replace=False)
    sample1 = [list1[i] for i in indices]
    sample2 = [list2[i] for i in indices]
    return semantic_similarity(sample1, sample2)
并行计算:
from concurrent.futures import ThreadPoolExecutor

def parallel_edit_distance(list1, list2):
    """Mean normalized Levenshtein similarity, computed across a thread pool.

    Aligned pairs are scored concurrently; returns the mean score.
    """
    def _pair_score(pair):
        left, right = pair
        return textdistance.levenshtein.normalized_similarity(left, right)

    with ThreadPoolExecutor() as pool:
        scores = list(pool.map(_pair_score, zip(list1, list2)))
    return np.mean(scores)
缓存模型:
from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_embedding(text):
    """Memoized wrapper around the shared model's encode().

    Avoids re-embedding a string that has been seen before; the cache is
    bounded at 1000 distinct strings.
    """
    return model.encode(text)
通过以上方法组合,可以准确判断两个字符串列表是否构成Direct Mapping。实际应用中建议:
先进行快速筛选(如长度/NULL检查)
再执行轻量级计算(词法/编辑距离)
最后对候选列进行语义验证
关键业务场景建议人工复核边界案例(相似度在0.7-0.9之间的)