python筛选出两个文件中重复行的方法
程序员文章站
2023-01-05 14:25:38
本文实例为大家分享了用 Python 脚本筛选出两个文件中重复行的方法,供大家参考,具体内容如下:
'''
查找a文件中,与b文件中内容不重复的内容
'''
#!us...
本文实例为大家分享了用 Python 脚本筛选出两个文件中重复行的方法,供大家参考,具体内容如下:
#!/usr/bin/python
"""Separate the lines of file A by whether they also occur in file B.

Usage: python <script> <file_a> <file_b>

File B is split into chunks of at most ``dpt`` lines (temp1, temp2, ...) so
that only one chunk has to be held in memory at a time.  File A is then
filtered against each chunk in turn; the lines that survive one pass become
the input of the next pass.  Two files are produced in the current directory:

* ``result`` - lines of A that match no line of B
* ``repeat`` - lines of A that match some line of B

Lines are compared with leading/trailing whitespace stripped.
"""
import os
import sys

dpt = 100000  # "data per file": maximum number of lines of B per chunk


def binarysearch(value, lines):
    """Binary-search ``lines`` for ``value``.

    Both ``value`` and the inspected entries of ``lines`` are stripped of
    surrounding whitespace before being compared.

    Args:
        value: the string to look for.
        lines: a list of strings ordered by their *stripped* text.

    Returns:
        1 if a matching entry exists, otherwise 0.
    """
    target = value.strip()
    left, right = 0, len(lines) - 1
    while left <= right:
        middle = (left + right) // 2
        candidate = lines[middle].strip()
        if target == candidate:
            return 1
        if target < candidate:
            right = middle - 1
        else:
            left = middle + 1
    return 0


def _split_reference_file(filename, chunk_size):
    """Split ``filename`` into temp1..tempN files of at most ``chunk_size`` lines.

    Returns a ``(number_of_temp_files, total_line_count)`` tuple.  At least
    one temp file is always created, even when the input is empty.
    """
    fileno = 1
    linecount = 0
    with open(filename) as source:
        chunk = open("temp{0}".format(fileno), "w")
        try:
            for line in source:
                if linecount >= chunk_size:
                    # Current chunk is full: roll over to the next temp file.
                    chunk.close()
                    fileno += 1
                    chunk = open("temp{0}".format(fileno), "w")
                    linecount = 0
                chunk.write(line)
                linecount += 1
        finally:
            chunk.close()
    return fileno, (fileno - 1) * chunk_size + linecount


def _filter_against_chunks(fileaname, tempfileno):
    """Filter file A against every temp chunk, one pass per chunk.

    Matching lines are appended to ``repeat``.  Non-matching lines are written
    alternately to ``result0`` / ``result1``; the output of one pass is the
    input of the next, so the last file written holds the final result.

    Returns a ``(last_result_file_name, repeat_count)`` tuple.
    """
    current_input = fileaname
    resulttempfile = None
    repeatcount = 0
    with open("repeat", "w") as fout:
        for i in range(1, tempfileno + 1):
            print("比较第{0}个临时文件...\n".format(i))
            resulttempfile = "result{0}".format((i - 1) % 2)
            with open("temp{0}".format(i)) as ftemp:
                # Sort by the stripped text, because that is what
                # binarysearch() compares.  Sorting the raw lines gives a
                # different order for lines with leading whitespace or an
                # interior tab, which made the search miss real duplicates.
                keys = sorted(line.strip() for line in ftemp)
            with open(current_input) as fa, open(resulttempfile, "w") as fresult:
                for line in fa:
                    if binarysearch(line, keys):
                        fout.write(line)
                        repeatcount += 1
                    else:
                        fresult.write(line)
            current_input = resulttempfile
    return resulttempfile, repeatcount


def main(argv=None, chunk_size=None):
    """Run the three steps: split B, filter A, clean up temporary files.

    Args:
        argv: argument vector shaped like ``sys.argv`` (program name, file A,
            file B).  Defaults to ``sys.argv``.
        chunk_size: maximum lines of B per temp file.  Defaults to ``dpt``.

    Raises:
        SystemExit: if fewer than two file names are supplied.
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    argv = sys.argv if argv is None else argv
    chunk_size = dpt if chunk_size is None else chunk_size
    if len(argv) < 3:
        sys.exit("usage: {0} <file_a> <file_b>".format(argv[0] if argv else "script"))
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    fileaname = argv[1]
    filebname = argv[2]

    # Step 1: split file B, the comparison baseline, into temp1..tempN.
    print("拆分比对文件...\n")
    tempfileno, total = _split_reference_file(filebname, chunk_size)
    print("拆分完成,一共{0}个临时文件,{1}条数据。\n".format(tempfileno, total))

    # Step 2: compare file A with each temp file; the last result file
    # written is the final answer.
    resulttempfile, repeatcount = _filter_against_chunks(fileaname, tempfileno)
    print("比较完成,重复数据{0}条".format(repeatcount))
    os.rename(resulttempfile, "result")
    if tempfileno >= 2:
        # With two or more passes both result0 and result1 were created by
        # this run; remove the one that is only an intermediate.
        leftover = "result1" if resulttempfile == "result0" else "result0"
        if os.path.exists(leftover):
            os.remove(leftover)

    # Step 3: delete the temporary chunk files.
    print("删除临时文件...\n")
    while tempfileno > 0:
        os.remove("temp{0}".format(tempfileno))
        tempfileno -= 1
    print("脚本结束。\n")


if __name__ == "__main__":
    main()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。