Pandas核对两组数据
程序员文章站
2022-03-03 20:42:31
...
现有一个excel表格,‘old’ tab和 ‘new’ tab各自放了旧的客户数据和新的客户数据,需要对比两个数据的不同。
*源数据表格为sample-addresses.xlsx
*程序运行完得到的表格为diff_result.xlsx
以下为代码:
import pandas as pd
import numpy as np
# Define the diff function to show the changes in each field
def report_diff(x):
return x[1] if x[1] == x[0] else '{} ---> {}'.format(x[1], x[0])
# Read in the two files but call the data old and new and create columns to track
old = pd.read_excel(r'C:\Users\Admin\Desktop\python_test\Excel\reconcile_project\project_0\sample-addresses.xlsx',
'old', na_values=['NA'])
new = pd.read_excel(r'C:\Users\Admin\Desktop\python_test\Excel\reconcile_project\project_0\sample-addresses.xlsx',
'new', na_values=['NA'])
old['version'] = "old"
new['version'] = "new"
# Join all the data together and ignore indexes so it all gets added
full_set = pd.concat([old, new], ignore_index=True)
# diff_nodupe is a DataFrame with no rows of duplicate account numbers
diff_nodupe = full_set.drop_duplicates(subset=['account number'], keep=False).set_index('account number')
# extract removed rows from old sheet
removed = diff_nodupe[diff_nodupe['version'] == 'old']
# extract newly added rows from new sheet
added = diff_nodupe[diff_nodupe['version'] == 'new']
# drop version column,axis = 1 means column, 0 means row
removed = removed.drop(['version'], axis=1)
added = added.drop(['version'], axis=1)
# Let's see what changes in the main columns we care about
changes = full_set.drop_duplicates(subset=["account number", "name", "street", "city", "state", "postal code"],
keep='last').set_index('account number')
# Get all duplicate index rows
dupes = changes[changes.index.duplicated(keep=False)]
# Pull out the old and new data into separate DataFrames
change_new = dupes[(dupes["version"] == "new")]
change_old = dupes[(dupes["version"] == "old")]
# Drop the temp columns - we don't need them now
change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)
# The following steps are to combine change_old & change_new, and transform it into 3D MultiIndex DataFrame(diff_panel)
# transform change_old from 2-d DataFrame to 1-d MultiIndex series
old_series = change_old.stack()
# transform change_new from 2-d DataFrame to 1-d MultiIndex series and then transform to 1-d MultiIndex DataFrame
diff_panel = change_new.stack().to_frame(name='new')
# add old_series to new_df new column old
diff_panel['old'] = old_series
# apply report_diff() to all rows
diff_output = diff_panel.apply(report_diff, axis=1)
# Transform MultiIndex DataFrame to single index DataFrame
diff_changed = diff_output.unstack()
# Put 3 DataFrames into excel 3 sheets
with pd.ExcelWriter(r'C:\Users\Admin\Desktop\python_test\Excel\reconcile_project\project_0\diff_result.xlsx') as writer:
diff_changed.to_excel(writer, sheet_name='changed')
removed.to_excel(writer, sheet_name='removed')
added.to_excel(writer, sheet_name='added')