Python数据处理工具使用方法整理

时间：2018-07-21 20:58:11 阅读：210 评论：0 收藏：0 [点我收藏+]

# 从csv文件创建DataFrame类型的数据结构

>>> df=pd.read_csv("xxx.csv")

# DataFrame类型的形状和长度

>>> df.shape
(38, 39)
>>> len(df)
38

# 各列的标题和数据类型

>>> df.columns
>>> df.dtypes

# 索引

>>> df.index
RangeIndex(start=0, stop=38, step=1)

# 将DataFrame转化成Numpy数组

>>> df.values

# 查看变量类型

>>> type(df)
<class 'pandas.core.frame.DataFrame'>

# 获取DataFrame的一列（得到的数据类型是Series）

>>> type(df)
<class 'pandas.core.frame.DataFrame'>
>>> col=df['104']
>>> type(col)
<class 'pandas.core.series.Series'>

# Series中与DataFrame相似的属性

>>> col.shape
(38,)
>>> col.values
array([301,  1051,  1657,  1852,  2057,  2258,  2938,  3418, 3718,  3938,  4148,  4568,  5068])
>>> col.index
RangeIndex(start=0, stop=38, step=1)
>>> col.name
'104'

# 截取最后几个元素

>>> col[-2:]
36    65536
37    65536
Name: 104, dtype: int64
>>> type(col[-2:])
<class 'pandas.core.series.Series'>

# 取DataFrame各元素的符号（正数为1，负数为-1，零为0）

>>> np.sign(df)
>>> last_col=df.columns[-1]
>>> np.sign(df[last_col])

# head（取前几行）和tail（取后几行）

>>> df.head(2)
>>> df.tail(2)

# 按索引查找某一行数据

>>> last_col=df.index[-1]
>>> last_col
>>> df.iloc[last_col]

# 按索引查找某一行的某一列数据

>>> df.iloc[2:9]
# 取单个元素时iloc和iat结果相同（iat只能访问单个标量值）
>>> df.iloc[2,3]
>>> df.iat[2,3]

# 逻辑查找

>>> df[df>df.mean()]

# 统计计算

# 描述信息
>>> df.describe()
# 非空数据的数量
>>> df.count()
# 平均绝对偏差（类似于标准差；注意：mad()在pandas 1.5中已弃用，2.0起已移除）
>>> df.mad()
# 中位数
>>> df.median()
# 最小值
>>> df.min()
# 最大值
>>> df.max()
# 众数
>>> df.mode()
# 标准差
>>> df.std()
# 方差
>>> df.var()
# 偏态系数（skewness,表示数据的对称程度）
>>> df.skew()
# 峰态系数（kurtosis,表示数据分布图的尖扁程度）
>>> df.kurt()

# 用python字典生成DataFrame
>>> df=pd.DataFrame({'weather':['cold','hot'],'food':['soup','ice cream']})
>>> df
        food weather
0       soup    cold
1  ice cream     hot

# 对某个属性按类型分组（以下输出对应的df比上面多了cake、bread两行）

>>> group=df.groupby('weather')
>>> for name,gro in group:
...     print(name)
...     print(gro)
... 
cold
   food weather
0  soup    cold
2  cake    cold
hot
        food weather
1  ice cream     hot
3      bread     hot
>>> group
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f110c24d1d0>

# 各分组第一行、最后一行、平均数（以下输出对应的df还包含price列）

>>> group=df.groupby('weather')
>>> group.first()
              food  price
weather                  
cold          soup      1
hot      ice cream      2
>>> group.last()
          food  price
weather              
cold      cake      3
hot      bread      4
>>> group.mean()
         price
weather       
cold         2
hot          3

# 查看分组

>>> g=df.groupby(['weather','food'])
>>> g.groups
{('hot', 'bread'): Int64Index([3], dtype='int64'), ('cold', 'cake'): Int64Index([2], dtype='int64'), ('hot', 'ice cream'): Int64Index([1], dtype='int64'), ('cold', 'soup'): Int64Index([0], dtype='int64')}

# 对分组应用聚合函数（如求平均值）

>>> g.agg([np.mean])
                  price
                   mean
weather food           
cold    cake          3
        soup          1
hot     bread         4
        ice cream     2

# 截取几行数据并连接

>>> d=pd.concat([df[:2],df[3:]])

>>> d
>>> d=pd.concat([df[:2],df[3:]])
>>> d
        food  price weather
0       soup      1    cold
1  ice cream      2     hot
3      bread      4     hot
>>> d.append(df[3:])
        food  price weather
0       soup      1    cold
1  ice cream      2     hot
3      bread      4     hot
3      bread      4     hot

Python数据处理工具使用方法整理

原文：https://www.cnblogs.com/kisetsu/p/9347825.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)