如何使用python 和excel 对100M左右的数据进行保存和简单分析
程序员文章站
2022-04-14 09:30:51
老师给了一个100m左右的数据(.csv 格式),然后让我们用python 和excel 进行分析。
具体要求和代码:
t2.py
import sqlite3
import pandas a...
老师给了一个100m左右的数据(.csv 格式),然后让我们用python 和excel 进行分析。
具体要求和代码:
t2.py
import sqlite3 import pandas as pd #### main ################################################## ##分为两个步骤,先是对数据读取后进行处理 ##然后将数据放入数据库 if __name__ == '__main__': ##三个文件路径 ########### 对数据读取后进行处理 ############################################################ filename = ['./data/opendata_order201608_afterclear.csv', './data/opendata_order201609_afterclear.csv', './data/opendata_order201610_afterclear.csv'] ##对每个文件进行处理 for index in range(3): ##一行行处理数据,将所需要增加的都放在后面,不影响结果,但是操作更加方便 rfile = open(filename[index], "r") # 用来临时保存数据的文件,opendata_order201608.csv targetfile = filename[index][7:27] + '.csv' wfile = open(targetfile, "w") # print(targetfile) linecount = 0 dataline = rfile.readline() # 读取一行数据 while dataline: linecount += 1 # 分割数据,用来接下来的处理 datalist = dataline.split(",") # print(datalist) ##对第一行,即header进行处理,增加所要求的内容 if (linecount == 1): datalist.append('month') datalist.append('year') datalist.append('area_name') datalist.append('brand_code') # print(datalist) else: # 对具体内容进行处理,增加所要求的内容 month = (datalist[0][4:6]) # year = (datalist[0][0:4]) # print(year) area_name = 'qx' + datalist[1][-3:] # print(area_name) brand_code = datalist[3][0:6] # print(brand_code) datalist.append(month) datalist.append(year) datalist.append(area_name) datalist.append(brand_code) # print(datalist) ##将内容整理好,放入临时文件 linecontent = "" rolcount = 0 for col in datalist: rolcount += 1 if (rolcount == 1): linecontent += col.strip() else: linecontent += "," + col.strip() # 一行行写入 wfile.write(linecontent + "\n") # print(linecontent) dataline = rfile.readline() print("数据行数=", linecount) rfile.close() wfile.close() ########## 将数据放入数据库 ################################### # 连接数据库 conn = sqlite3.connect("./db/saledata.db") ##采用pandas 的一个函数,可以很方便的直接将数据从文件中写入到数据库 df = pd.read_csv(targetfile) df.to_sql(filename[index][7:27], conn, if_exists='append', index=false)
t3.py
import sqlite3 # 连接数据库 conn = sqlite3.connect('./db/saledata.db') # 查询语句 query_sql = ''' select opendata_order201608.kind, opendata_order201608.area_name, sum(opendata_order201608.purch_tax_amt) from opendata_order201608 group by opendata_order201608.kind, opendata_order201608.area_name ''' ##进行查询 query = conn.execute(query_sql) ##将查询结果写入 filename = './txt/areakind.txt' fd = open(filename, 'w', encoding='utf-8') print('烟的种类', '县区', '销售利润', file=fd, sep=',') for kind in query: print(kind[0], kind[1], kind[2], file=fd, sep=',') fd.close() # print(query[1][1])
t4.py
import sqlite3 # 连接数据库 conn = sqlite3.connect('./db/saledata.db') ############################################### ##对每个月的查询分开处理,先是8月 # 查询语句 query_sql = ''' select opendata_order201608.cust_code,avg(purch_tax_amt) from opendata_order201608 group by opendata_order201608.cust_code ''' ##查询 query = conn.execute(query_sql) ##将查询结果进行写入 filename = './txt/t4.txt' num = 0 fd = open(filename, 'w', encoding='utf-8') print('月份', '商家', '销售利润', file=fd, sep=',') for kind in query: num += 1 print('8月', kind[0], kind[1], file=fd, sep=',') print('8月商户数量:', num) ################### 9月 ###################################### query_sql = ''' select opendata_order201609.cust_code,avg(purch_tax_amt) from opendata_order201609 group by opendata_order201609.cust_code ''' query = conn.execute(query_sql) filename = './txt/t4.txt' num = 0 fd = open(filename, 'a', encoding='utf-8') # print('月份','商家','销售利润',file=fd,sep=',') for kind in query: num += 1 print('9月', kind[0], kind[1], file=fd, sep=',') print('9月商户数量:', num) ################### 10月 #################################### query_sql = ''' select opendata_order201610.cust_code,avg(purch_tax_amt) from opendata_order201610 group by opendata_order201610.cust_code ''' query = conn.execute(query_sql) filename = './txt/t4.txt' num = 0 fd = open(filename, 'a', encoding='utf-8') # print('月份','商家','销售利润',file=fd,sep=',') for kind in query: num += 1 print('10月', kind[0], kind[1], file=fd, sep=',') print('10月商户数量:', num) fd.close()t5.py
import sqlite3 ##连接数据库 conn = sqlite3.connect('./db/saledata.db') ##对每个月的查询分开处理,先是8月############################################## query_sql = ''' select opendata_order201608.item_code, sum(opendata_order201608.purch_qty), sum(opendata_order201608.purch_tax_amt)/sum(opendata_order201608.purch_qty) from opendata_order201608 group by opendata_order201608.item_code ''' query = conn.execute(query_sql) filename = './txt/t5.txt' num = 0 fd = open(filename, 'w', encoding='utf-8') print('月', '规格', '数量', '单价', file=fd, sep=',') for kind in query: if (kind[1] == 0): continue num += 1 print('8月', kind[0], kind[1], kind[2], file=fd, sep=',') print(num) #################### 9月 ############################################ query_sql = ''' select opendata_order201609.item_code, opendata_order201609.purch_qty, opendata_order201609.purch_tax_amt/opendata_order201609.purch_qty from opendata_order201609 group by opendata_order201609.item_code ''' query = conn.execute(query_sql) filename = './txt/t5.txt' num = 0 fd = open(filename, 'a', encoding='utf-8') # print('月','规格','数量','单价',file=fd,sep=',') for kind in query: if (kind[1] == 0): continue num += 1 print('9月', kind[0], kind[1], kind[2], file=fd, sep=',') print(num) ####################### 10月 ################################################## query_sql = ''' select opendata_order201610.item_code, opendata_order201610.purch_qty, opendata_order201610.purch_tax_amt/opendata_order201610.purch_qty from opendata_order201610 group by opendata_order201610.item_code ''' query = conn.execute(query_sql) filename = './txt/t5.txt' num = 0 fd = open(filename, 'a', encoding='utf-8') # print('月','规格','数量','单价',file=fd,sep=',') for kind in query: if (kind[1] == 0): continue num += 1 print('10月', kind[0], kind[1], kind[2], file=fd, sep=',') print(num) fd.close()
t6.py
import numpy import sqlite3 # 连接数据库 conn = sqlite3.connect('./db/saledata.db') ##由于需要知道每一个商家的销售额占总销售额的比例,所以需要查询两次 # 第一次 每一个商家的销售额 query_sql = ''' select opendata_order201608.cust_code, sum(opendata_order201608.purch_tax_amt) from opendata_order201608 group by opendata_order201608.cust_code order by sum(opendata_order201608.purch_tax_amt) desc ''' query = conn.execute(query_sql) # 第二次 总的销售额 query_sql_total = ''' select sum(opendata_order201608.purch_tax_amt) from opendata_order201608 ''' query_total = conn.execute(query_sql_total) total_sum = query_total.fetchone()[0] ## 写入 #################################################### filename = './txt/t6.txt' num_total = 0 # 总的商家数 fd = open(filename, 'w', encoding='utf-8') print('商户', '销售金额', '占比', file=fd, sep=',') zhanbi_rate = 0 # 销售收入占比率 flag_num = 0 # 当超过80时,商家的个数 flag = 0 # 用来标记第一次超过80% for kind in query: num_total += 1 zhanbi = round((kind[1]) / (total_sum) * 100, 5) # 百分比 保留五位小数 zhanbi_rate += zhanbi if zhanbi_rate > 80 and flag == 0: flag_num = num_total print('当商户的销售占比率超过80%,商户的数量: ', flag_num) flag = 1 print(kind[0], kind[1], zhanbi, file=fd, sep=',') print('总的商户数量 :', num_total) print('商户占比率 ', round(flag_num / num_total, 5))
t7.py
import sqlite3 import numpy as np # 连接数据库 conn = sqlite3.connect('./db/saledata.db') ##### 平均值 ############################## query_sql = ''' select opendata_order201608.cust_code, avg(opendata_order201608.purch_tax_amt) from opendata_order201608 group by opendata_order201608.cust_code order by opendata_order201608.cust_code asc ''' query = conn.execute(query_sql) ###### 方差 ############################ query_sql_per = ''' select opendata_order201608.cust_code, opendata_order201608.purch_tax_amt from opendata_order201608 order by opendata_order201608.cust_code asc ''' query_per = conn.execute(query_sql_per) purch_per = [] # 用来保存每一个商家的销售额 fangcha = {} # 用来保存每一个商家的销售额的方差 cust_code = 1 # 用来保存每一个商家的cust_code last_custcode = 0 # 用来保存最后一个商家的cust_code for i in query_per.fetchall(): # 如果当前cust_code不等于之前的cust_code,则说明上一个商家的记录结束,进行处理 if i[0] != cust_code: # print(purch_per) # 计算方差 cust_fangcha = round(np.std(purch_per, ddof=1), 5) # print(cust_fangcha) # 用字典保存方差 fangcha[cust_code] = cust_fangcha # 更新 cust_code cust_code = i[0] ##将这个[]清空 purch_per = [] # 保存销售额 purch_per.append(i[1]) last_custcode = i[0] ###在之前的操作中,没有计算最后一个商家的方差,这里加上 last_fangcha = round(np.std(purch_per, ddof=1), 5) fangcha[last_custcode] = last_fangcha ############ 写入 ############################## filename = './txt/t7.txt' fd = open(filename, 'w', encoding='utf-8') print('商家', '平均', '方差', file=fd, sep=',') # fnagchanum=0 for kind in query: print(kind[0], round(kind[1], 5), fangcha[kind[0]], file=fd, sep=',') # fnagchanum += 1 fd.close()
t8.py
import sqlite3 # 连接数据库 conn = sqlite3.connect('./db/saledata.db') # 查询语句 query_sql = ''' select opendata_order201608.cust_code, sum(opendata_order201608.purch_tax_amt) from opendata_order201608 group by opendata_order201608.cust_code order by sum(purch_tax_amt) desc ''' # 查询 query = conn.execute(query_sql) # 遍历一遍,计算商家数量 num_cust = 0 for kind in query: num_cust += 1 # 一个很有趣的现象,如果查询结果被遍历一边,然后query就为[],所以需要再遍历一遍 query_again = conn.execute(query_sql) numagain = 0 # 每个等分中 商家的销售额 sumper = [0 for i in range(5)] # print(query.fetchall()) for per in query_again: index = int(numagain / (num_cust / 5)) # print(index) sumper[index] += per[1] # print('-'*16) numagain += 1 # 保存所要求的倍数 rate = [] for index in range(5): # 计算倍数,写入list rate.append(round(sumper[0] / sumper[index], 3)) print(rate[index]) # print(sumper) ##写入 filename = './txt/t8.txt' fd = open(filename, 'w', encoding='utf-8') print('等分', '比值', file=fd, sep=',') for index in range(5): print(index, rate[index], file=fd, sep=',') fd.close()
t9.py
import sqlite3 #在这一道题中,我想要看一下每一类烟的平均单价与销售数量,看看有什么联系 ##连接数据库 conn = sqlite3.connect('./db/saledata.db') ##对每个月的查询分开处理,先是8月############################################## query_sql = ''' select opendata_order201608.brand_code, sum(opendata_order201608.purch_tax_amt)/sum(opendata_order201608.purch_qty), sum(opendata_order201608.purch_qty) from opendata_order201608 group by opendata_order201608.brand_code order by sum(opendata_order201608.purch_tax_amt)/sum(opendata_order201608.purch_qty) desc ''' query = conn.execute(query_sql) filename = './txt/t9.txt' num = 0 fd = open(filename, 'w', encoding='utf-8') print( '品牌', '单价', '数量', file=fd, sep=',') for kind in query: if (kind[1] == 0): continue num += 1 print( kind[0], kind[1], kind[2], file=fd, sep=',') print(num)