Tidy Data in Python
by Jean-Nicholas Hould
import pandas as pd
import datetime
from os import listdir
from os.path import isfile, join
import glob
import re
Column headers are values, not variable names
Pew Research Center Dataset
# Load the raw Pew Research survey: one row per religion, one column per
# income bracket (a "messy" wide layout — headers are values, not variables).
df = pd.read_csv("./data/pew-raw.csv")
df
|
religion |
<$10k |
$10-20k |
$20-30k |
$30-40k |
$40-50k |
$50-75k |
0 |
Agnostic |
27 |
34 |
60 |
81 |
76 |
137 |
1 |
Atheist |
12 |
27 |
37 |
52 |
35 |
70 |
2 |
Buddhist |
27 |
21 |
30 |
34 |
33 |
58 |
3 |
Catholic |
418 |
617 |
732 |
670 |
638 |
1116 |
4 |
Dont know/refused |
15 |
14 |
15 |
11 |
10 |
35 |
5 |
Evangelical Prot |
575 |
869 |
1064 |
982 |
881 |
1486 |
6 |
Hindu |
1 |
9 |
7 |
9 |
11 |
34 |
7 |
Historically Black Prot |
228 |
244 |
236 |
238 |
197 |
223 |
8 |
Jehovahs Witness |
20 |
27 |
24 |
24 |
21 |
30 |
9 |
Jewish |
19 |
19 |
25 |
25 |
30 |
95 |
# Tidy the Pew table: collapse the income-bracket columns into a single
# (income, freq) pair per row, keyed by religion.
formatted_df = (
    df.melt(id_vars=["religion"], var_name="income", value_name="freq")
      .sort_values(by=["religion"])
)
formatted_df.head(10)
|
religion |
income |
freq |
0 |
Agnostic |
<$10k |
27 |
30 |
Agnostic |
$30-40k |
81 |
40 |
Agnostic |
$40-50k |
76 |
50 |
Agnostic |
$50-75k |
137 |
10 |
Agnostic |
$10-20k |
34 |
20 |
Agnostic |
$20-30k |
60 |
41 |
Atheist |
$40-50k |
35 |
21 |
Atheist |
$20-30k |
37 |
11 |
Atheist |
$10-20k |
27 |
31 |
Atheist |
$30-40k |
52 |
Billboard Top 100 Dataset
# Billboard chart data: one column per week on the chart (x1st.week ... x76th.week).
# mac_latin2 decoding is needed for the non-ASCII artist/track names in the raw file.
df = pd.read_csv("./data/billboard.csv", encoding="mac_latin2")
df.head(10)
|
year |
artist.inverted |
track |
time |
genre |
date.entered |
date.peaked |
x1st.week |
x2nd.week |
x3rd.week |
... |
x67th.week |
x68th.week |
x69th.week |
x70th.week |
x71st.week |
x72nd.week |
x73rd.week |
x74th.week |
x75th.week |
x76th.week |
0 |
2000 |
Destiny‘s Child |
Independent Women Part I |
3:38 |
Rock |
2000-09-23 |
2000-11-18 |
78 |
63.0 |
49.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
1 |
2000 |
Santana |
Maria, Maria |
4:18 |
Rock |
2000-02-12 |
2000-04-08 |
15 |
8.0 |
6.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2 |
2000 |
Savage Garden |
I Knew I Loved You |
4:07 |
Rock |
1999-10-23 |
2000-01-29 |
71 |
48.0 |
43.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3 |
2000 |
Madonna |
Music |
3:45 |
Rock |
2000-08-12 |
2000-09-16 |
41 |
23.0 |
18.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
4 |
2000 |
Aguilera, Christina |
Come On Over Baby (All I Want Is You) |
3:38 |
Rock |
2000-08-05 |
2000-10-14 |
57 |
47.0 |
45.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
5 |
2000 |
Janet |
Doesn‘t Really Matter |
4:17 |
Rock |
2000-06-17 |
2000-08-26 |
59 |
52.0 |
43.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
6 |
2000 |
Destiny‘s Child |
Say My Name |
4:31 |
Rock |
1999-12-25 |
2000-03-18 |
83 |
83.0 |
44.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
7 |
2000 |
Iglesias, Enrique |
Be With You |
3:36 |
Latin |
2000-04-01 |
2000-06-24 |
63 |
45.0 |
34.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
8 |
2000 |
Sisqo |
Incomplete |
3:52 |
Rock |
2000-06-24 |
2000-08-12 |
77 |
66.0 |
61.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
9 |
2000 |
Lonestar |
Amazed |
4:25 |
Country |
1999-06-05 |
2000-03-04 |
81 |
54.0 |
44.0 |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
10 rows × 83 columns
# Melting: turn the 76 "x1st.week"..."x76th.week" columns into (week, rank) rows.
id_vars = ["year","artist.inverted","track","time","genre","date.entered","date.peaked"]
df = pd.melt(frame=df, id_vars=id_vars, var_name="week", value_name="rank")
# Formatting: "x1st.week" -> 1 (raw string avoids invalid-escape warnings;
# the original used mis-encoded curly quotes, which are a SyntaxError).
df["week"] = df["week"].str.extract(r"(\d+)", expand=False).astype(int)
# Cleaning out unnecessary rows: weeks after the song left the chart are NaN.
# The dropna MUST happen before the int cast — astype(int) raises on NaN
# (the original cast first, which fails on this dataset).
df = df.dropna()
df["rank"] = df["rank"].astype(int)
# Create "date" column: the chart date of week N is the entry date plus N-1 weeks.
df["date"] = pd.to_datetime(df["date.entered"]) + pd.to_timedelta(df["week"], unit="w") - pd.DateOffset(weeks=1)
df = df[["year", "artist.inverted", "track", "time", "genre", "week", "rank", "date"]]
df = df.sort_values(ascending=True, by=["year","artist.inverted","track","week","rank"])
# Assigning the tidy dataset to a variable for future usage
billboard = df
df.head(10)
|
year |
artist.inverted |
track |
time |
genre |
week |
rank |
date |
246 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
1 |
87 |
2000-02-26 |
563 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
2 |
82 |
2000-03-04 |
880 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
3 |
72 |
2000-03-11 |
1197 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
4 |
77 |
2000-03-18 |
1514 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
5 |
87 |
2000-03-25 |
1831 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
6 |
94 |
2000-04-01 |
2148 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
7 |
99 |
2000-04-08 |
287 |
2000 |
2Ge+her |
The Hardest Part Of Breaking Up (Is Getting Ba... |
3:15 |
R&B |
1 |
91 |
2000-09-02 |
604 |
2000 |
2Ge+her |
The Hardest Part Of Breaking Up (Is Getting Ba... |
3:15 |
R&B |
2 |
87 |
2000-09-09 |
921 |
2000 |
2Ge+her |
The Hardest Part Of Breaking Up (Is Getting Ba... |
3:15 |
R&B |
3 |
92 |
2000-09-16 |
Multiple types in one table
# Split out the song-level attributes: one row per distinct track,
# with a surrogate song_id taken from the fresh positional index.
songs_cols = ["year", "artist.inverted", "track", "time", "genre"]
songs = billboard[songs_cols].drop_duplicates().reset_index(drop=True)
songs["song_id"] = songs.index
songs.head(10)
|
year |
artist.inverted |
track |
time |
genre |
song_id |
0 |
2000 |
2 Pac |
Baby Don‘t Cry (Keep Ya Head Up II) |
4:22 |
Rap |
0 |
1 |
2000 |
2Ge+her |
The Hardest Part Of Breaking Up (Is Getting Ba... |
3:15 |
R&B |
1 |
2 |
2000 |
3 Doors Down |
Kryptonite |
3:53 |
Rock |
2 |
3 |
2000 |
3 Doors Down |
Loser |
4:24 |
Rock |
3 |
4 |
2000 |
504 Boyz |
Wobble Wobble |
3:35 |
Rap |
4 |
5 |
2000 |
98° |
Give Me Just One Night (Una Noche) |
3:24 |
Rock |
5 |
6 |
2000 |
A*Teens |
Dancing Queen |
3:44 |
Pop |
6 |
7 |
2000 |
Aaliyah |
I Don‘t Wanna |
4:15 |
Rock |
7 |
8 |
2000 |
Aaliyah |
Try Again |
4:03 |
Rock |
8 |
9 |
2000 |
Adams, Yolanda |
Open My Heart |
5:30 |
Gospel |
9 |
# Attach each weekly rank to its song_id by joining on the natural key,
# then keep only the fact columns.
join_cols = ["year", "artist.inverted", "track", "time", "genre"]
ranks = billboard.merge(songs, on=join_cols)[["song_id", "date", "rank"]]
ranks.head(10)
|
song_id |
date |
rank |
0 |
0 |
2000-02-26 |
87 |
1 |
0 |
2000-03-04 |
82 |
2 |
0 |
2000-03-11 |
72 |
3 |
0 |
2000-03-18 |
77 |
4 |
0 |
2000-03-25 |
87 |
5 |
0 |
2000-04-01 |
94 |
6 |
0 |
2000-04-08 |
99 |
7 |
1 |
2000-09-02 |
91 |
8 |
1 |
2000-09-09 |
87 |
9 |
1 |
2000-09-16 |
92 |
Multiple variables stored in one column
Tuberculosis Example
A few notes on the raw data set:
- The columns starting with "m" or "f" contain multiple variables:
- Sex ("m" or "f")
- Age Group ("0-14", "15-24", "25-34", "35-44", "45-54", "55-64", "65", "unknown")
- Mixture of 0s and missing values ("NaN"). This is due to the data collection process and the distinction is important for this dataset.
# Load the raw tuberculosis counts: headers like "m014" encode two
# variables at once — sex ("m"/"f") and age group ("014" = 0-14).
df = pd.read_csv("./data/tb-raw.csv")
df
|
country |
year |
m014 |
m1524 |
m2534 |
m3544 |
m4554 |
m5564 |
m65 |
mu |
f014 |
0 |
AD |
2000 |
0.0 |
0.0 |
1.0 |
0.0 |
0 |
0 |
0.0 |
NaN |
NaN |
1 |
AE |
2000 |
2.0 |
4.0 |
4.0 |
6.0 |
5 |
12 |
10.0 |
NaN |
3.0 |
2 |
AF |
2000 |
52.0 |
228.0 |
183.0 |
149.0 |
129 |
94 |
80.0 |
NaN |
93.0 |
3 |
AG |
2000 |
0.0 |
0.0 |
0.0 |
0.0 |
0 |
0 |
1.0 |
NaN |
1.0 |
4 |
AL |
2000 |
2.0 |
19.0 |
21.0 |
14.0 |
24 |
19 |
16.0 |
NaN |
3.0 |
5 |
AM |
2000 |
2.0 |
152.0 |
130.0 |
131.0 |
63 |
26 |
21.0 |
NaN |
1.0 |
6 |
AN |
2000 |
0.0 |
0.0 |
1.0 |
2.0 |
0 |
0 |
0.0 |
NaN |
0.0 |
7 |
AO |
2000 |
186.0 |
999.0 |
1003.0 |
912.0 |
482 |
312 |
194.0 |
NaN |
247.0 |
8 |
AR |
2000 |
97.0 |
278.0 |
594.0 |
402.0 |
419 |
368 |
330.0 |
NaN |
121.0 |
9 |
AS |
2000 |
NaN |
NaN |
NaN |
NaN |
1 |
1 |
NaN |
NaN |
NaN |
# Melt the sex+age columns into a single "sex_and_age" variable column.
df = pd.melt(df, id_vars=["country","year"], value_name="cases", var_name="sex_and_age")
# Extract Sex, Age lower bound and Age upper bound group.
# Raw string avoids invalid-escape warnings. NOTE(review): headers like
# "m65" and "mu" do not match (\D)(\d+)(\d{2}) and become NaN, so those
# rows are discarded by the dropna below — confirm that is intended.
tmp_df = df["sex_and_age"].str.extract(r"(\D)(\d+)(\d{2})", expand=False)
# Name columns
tmp_df.columns = ["sex", "age_lower", "age_upper"]
# Create `age` column based on `age_lower` and `age_upper`
tmp_df["age"] = tmp_df["age_lower"] + "-" + tmp_df["age_upper"]
# Merge the extracted columns back alongside the original frame.
df = pd.concat([df, tmp_df], axis=1)
# Drop unnecessary columns and rows (the original used mis-encoded curly
# quotes around 'sex_and_age', which is a SyntaxError).
df = df.drop(["sex_and_age", "age_lower", "age_upper"], axis=1)
df = df.dropna()
df = df.sort_values(ascending=True, by=["country", "year", "sex", "age"])
df.head(10)
|
country |
year |
cases |
sex |
age |
0 |
AD |
2000 |
0.0 |
m |
0-14 |
10 |
AD |
2000 |
0.0 |
m |
15-24 |
20 |
AD |
2000 |
1.0 |
m |
25-34 |
30 |
AD |
2000 |
0.0 |
m |
35-44 |
40 |
AD |
2000 |
0.0 |
m |
45-54 |
50 |
AD |
2000 |
0.0 |
m |
55-64 |
81 |
AE |
2000 |
3.0 |
f |
0-14 |
1 |
AE |
2000 |
2.0 |
m |
0-14 |
11 |
AE |
2000 |
4.0 |
m |
15-24 |
21 |
AE |
2000 |
4.0 |
m |
25-34 |
Variables are stored in both rows and columns
Global Historical Climatology Network Dataset
# Load the weather observations and melt the per-day columns (d1, d2, ...)
# into a single "day_raw" variable column.
df = pd.read_csv("./data/weather-raw.csv")
df = df.melt(id_vars=["id", "year", "month", "element"], var_name="day_raw")
df.head(10)
|
id |
year |
month |
element |
day_raw |
value |
0 |
MX17004 |
2010 |
1 |
tmax |
d1 |
NaN |
1 |
MX17004 |
2010 |
1 |
tmin |
d1 |
NaN |
2 |
MX17004 |
2010 |
2 |
tmax |
d1 |
NaN |
3 |
MX17004 |
2010 |
2 |
tmin |
d1 |
NaN |
4 |
MX17004 |
2010 |
3 |
tmax |
d1 |
NaN |
5 |
MX17004 |
2010 |
3 |
tmin |
d1 |
NaN |
6 |
MX17004 |
2010 |
4 |
tmax |
d1 |
NaN |
7 |
MX17004 |
2010 |
4 |
tmin |
d1 |
NaN |
8 |
MX17004 |
2010 |
5 |
tmax |
d1 |
NaN |
9 |
MX17004 |
2010 |
5 |
tmin |
d1 |
NaN |
# Extracting day: "d1" -> "1" (raw string avoids invalid-escape warnings).
df["day"] = df["day_raw"].str.extract(r"d(\d+)", expand=False)
df["id"] = "MX17004"
# To numeric values. The original passed errors='ignore' (with mis-encoded
# curly quotes — a SyntaxError); that option is deprecated and unnecessary
# here since year/month/day are always digit strings.
df[["year","month","day"]] = df[["year","month","day"]].apply(pd.to_numeric)
# Creating a date from the different columns
def create_date_from_year_month_day(row):
    """Build a datetime from a row's numeric year/month/day fields."""
    return datetime.datetime(year=int(row["year"]), month=int(row["month"]), day=int(row["day"]))
df["date"] = df.apply(create_date_from_year_month_day, axis=1)
df = df.drop(["year", "month", "day", "day_raw"], axis=1)
df = df.dropna()
# Unmelting column "element": tmax/tmin become columns again, one row per (id, date).
df = df.pivot_table(index=["id","date"], columns="element", values="value")
df.reset_index(drop=False, inplace=True)
df
element |
id |
date |
tmax |
tmin |
0 |
MX17004 |
2010-02-02 |
27.3 |
14.4 |
1 |
MX17004 |
2010-02-03 |
24.1 |
14.4 |
2 |
MX17004 |
2010-03-05 |
32.1 |
14.2 |
One type in multiple tables
Baby Names in Illinois
def extract_year(string):
    """Return the rightmost 4-digit run in *string* (e.g. a filename), or None.

    The greedy ``.+`` requires at least one leading character and makes the
    group capture the last possible 4-digit substring. Raw string avoids
    invalid-escape warnings; ``is not None`` replaces the ``!= None`` anti-idiom.
    """
    match = re.match(r".+(\d{4})", string)
    return match.group(1) if match is not None else None
# One type split across many files: each year's names live in a separate CSV.
# (The original path literal used mis-encoded curly quotes — a SyntaxError.)
path = "./data"
allFiles = glob.glob(path + "/201*-baby-names-illinois.csv")
df_list = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0)
    # Normalise headers so every yearly file shares the same column names.
    df.columns = [col.lower() for col in df.columns]
    # The year exists only in the filename; materialise it as a column.
    df["year"] = extract_year(file_)
    df_list.append(df)
# (Removed the unused `frame = pd.DataFrame()` from the original.)
df = pd.concat(df_list)
df.head(5)
|
rank |
name |
frequency |
sex |
year |
0 |
1 |
Noah |
837 |
Male |
2014 |
1 |
2 |
Alexander |
747 |
Male |
2014 |
2 |
3 |
William |
687 |
Male |
2014 |
3 |
4 |
Michael |
680 |
Male |
2014 |
4 |
5 |
Liam |
670 |
Male |
2014 |
Tidy Data in Python
原文:https://www.cnblogs.com/ministep/p/14589534.html