
pandas data cleaning (Part 2)

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect('localhost', 'root', '123456', 'languid')
engine = create_engine('mysql+pymysql://root:123456@localhost/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))


# user account / terminal type / service / app / record time / upload traffic / download traffic
col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']

filepath = 'C://百度网盘//20181007_flux_40.csv'
# def data_deal(filepath):
if __name__ == '__main__':
    df_flux = pd.read_csv(filepath, sep=',', error_bad_lines=False,
                          usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list,
                          engine='python', encoding='utf-8', nrows=22222)
    # 1. Drop rows that are completely empty
    df_flux.dropna(how='all', inplace=True)
    # 2. Drop rows whose user value is null
    df_flux.dropna(subset=['user'], inplace=True, axis=0)
    df_flux['record_time'] = '2019-5-28'
    # 3. Daily traffic totals per user / terminal type / service / app
    df_flux2 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time'])[['up_flux', 'down_flux']].sum()
    # 4. Record counts per user / terminal type / service / app / day
    df_flux3 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time']).count()
    df_flux4 = df_flux3.drop(['down_flux'], axis=1)
    df_flux5 = df_flux4.rename(columns={'up_flux': 'counts'}, inplace=False)

    df_flux2 = DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={'up_flux': 'up_flux_sum', 'down_flux': 'down_flux_sum'})

    result = pd.concat([df_flux5, df_flux2], axis=1)

    print(result)

What the script above does:
1. Drop rows that are completely empty.
2. Drop rows where the user column is null.
3. Sum the upload and download traffic for the day per user, terminal type, service, and app.
4. Count the number of records per user, terminal type, service, and app per day.
(A single-pass alternative to steps 3 and 4 is sketched below.)
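The script builds the sums with one groupby and the counts with a second groupby, then stitches them together with concat. The same table can usually be produced in a single pass with groupby().agg(). This is only a minimal sketch on made-up toy data with the same column names; named aggregation as used here needs pandas 0.25 or newer.

import pandas as pd

toy = pd.DataFrame({
    'user': ['u1', 'u1', 'u2'],
    'tm_type': ['PC', 'PC', 'mobile'],
    'up_flux': [10, 20, 5],
    'down_flux': [100, 200, 50],
})

# One groupby producing the traffic sums and the record count together
agg = toy.groupby(['user', 'tm_type'], as_index=False).agg(
    up_flux_sum=('up_flux', 'sum'),
    down_flux_sum=('down_flux', 'sum'),
    counts=('up_flux', 'count'),
)
print(agg)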

 

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect('localhost', 'root', '123456', 'languid')
engine = create_engine('mysql+pymysql://root:123456@localhost/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))


# user account / terminal type / service / app / record time / upload traffic / download traffic
col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']
student_list = ['user', 'age', 'low', 'high', 'time']

filepath = 'C://百度网盘//20181007_flux_40.csv'
filepath2 = 'C://百度网盘//v_student_net.csv'
# def data_deal(filepath):
if __name__ == '__main__':
    df_flux = pd.read_csv(filepath, sep=',', error_bad_lines=False,
                          usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list,
                          engine='python', encoding='utf-8')
    df_flux.dropna(how='all', inplace=True)
    df_flux.dropna(subset=['user'], inplace=True, axis=0)
    df_flux['record_time'] = '2019-5-28'
    df_flux2 = df_flux.groupby(['user', 'tm_type', 'serv', 'app', 'record_time'], as_index=False)[['up_flux', 'down_flux']].sum()
    df_flux3 = df_flux.groupby(by=['user', 'tm_type', 'serv', 'app', 'record_time'], as_index=False).count()
    df_flux4 = df_flux3.drop(['down_flux'], axis=1)
    df_flux5 = df_flux4.rename(columns={'up_flux': 'counts'}, inplace=False)
    df_flux2 = DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={'up_flux': 'up_flux_sum', 'down_flux': 'down_flux_sum'})
    result = pd.concat([df_flux2, df_flux5['counts']], axis=1)
    # Filter out rows whose user value is an IP address (here: contains "10.")
    result_1 = df_flux2[~df_flux2['user'].str.contains(r'10\.')].copy()
    result_1['down_flux_sum'] = result_1['down_flux_sum'].astype(float)
    # result_1['user'] = result_1['user'].astype(float)
    # qqq = result_1[result_1['user']]
    # Normalize the tm_type column with regex replacements and loc assignments
    result_1['tm_type'].replace(r'\/移动终端\/\w*系统移动终端', 'mobile', regex=True, inplace=True)
    result_1.loc[result_1['tm_type'].str.contains('多终端'), 'tm_type'] = '多终端'
    result_1.loc[result_1['tm_type'].str.contains('未知类型'), 'tm_type'] = 'Unknown'
    result_1['tm_type'].replace(r'\/PC\/MAC PC', 'PC', regex=True, inplace=True)

    # Join the cleaned traffic summary with the student table and aggregate per class
    v_student = pd.read_csv(filepath2, sep=',', error_bad_lines=False, engine='python',
                            encoding='utf-8', header=0, index_col=[0])
    unique_value = v_student['username'].nunique()
    v_student = v_student.rename(columns={'username': 'user'}, inplace=False)
    student_merge = pd.merge(v_student, result_1, how='inner')
    student_group = student_merge.groupby(['class_code'], as_index=False)['down_flux_sum'].sum()
    student_group_2 = student_merge.groupby(['class_code'], as_index=False)['up_flux_sum'].count()
    student_group_3 = student_group_2.rename(columns={'up_flux_sum': 'counts'}, inplace=False)

What the second script adds:

1. Clean the tm_type column with regular expressions and loc, applying these mappings:

…系统移动终端 (OS-specific mobile terminals) → mobile

/PC/MAC PC → PC

多终端 (multiple terminals) → 多终端

未知类型 (unknown type) → Unknown

2. IP filtering: drop the rows whose user value is an IP address.

3. Type conversion: cast the traffic-sum column (down_flux_sum in the script) to float.

A standalone sketch of these steps follows.
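The sketch below runs on a few made-up rows; the regex patterns and replacement labels mirror the script above, everything else is illustrative only.

import pandas as pd

df = pd.DataFrame({
    'user': ['stu001', 'stu002', 'stu003', '10.20.30.40'],
    'tm_type': ['/移动终端/安卓系统移动终端', '/PC/MAC PC', '未知类型', '未知类型'],
    'down_flux_sum': ['123', '456', '321', '789'],
})

# 2. Drop rows whose user value looks like an IP address (contains "10.")
df = df[~df['user'].str.contains(r'10\.')].copy()

# 1. Normalize tm_type with a regex replace plus loc assignments
df['tm_type'] = df['tm_type'].replace(r'\/移动终端\/\w*系统移动终端', 'mobile', regex=True)
df.loc[df['tm_type'].str.contains('未知类型'), 'tm_type'] = 'Unknown'
df['tm_type'] = df['tm_type'].replace(r'\/PC\/MAC PC', 'PC', regex=True)

# 3. Cast the traffic-sum column from string to float
df['down_flux_sum'] = df['down_flux_sum'].astype(float)
print(df)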


Original post: https://www.cnblogs.com/languid/p/10960559.html
