This introduction to pandas is derived from Data School's pandas Q&A series, with my own notes and code.
Finding and removing duplicate rows in pandas¶
In [4]:
import pandas as pd
In [6]:
# read a dataset of movie reviewers (modifying the default parameter values for read_table)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
url = 'http://bit.ly/movieusers'
users = pd.read_table(url, sep='|', header=None, names=user_cols, index_col='user_id')
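If the bit.ly shortlink is ever unavailable, a minimal stand-in frame is enough to follow along. This is a sketch with invented values, using the same column layout as the MovieLens users file; you could assign it to users to run the cells below unchanged.

# hypothetical stand-in data, same columns as the real file
users_demo = pd.DataFrame(
    {'age': [24, 53, 24, 33],
     'gender': ['M', 'F', 'M', 'F'],
     'occupation': ['technician', 'other', 'technician', 'other'],
     'zip_code': ['85711', '94043', '85711', '15213']},
    index=pd.Index([1, 2, 3, 4], name='user_id'))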
In [7]:
users.head()
Out[7]:
In [8]:
users.shape
Out[8]:
Suppose we want to identify users with duplicate zip_code values.
In [15]:
# use df.column_name.duplicated()
# outputs True for each value that has already appeared earlier in the Series
users.zip_code.duplicated()
Out[15]:
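To make the rule concrete, here is a toy Series (values invented for illustration). A value is flagged only once it has already appeared earlier:

s = pd.Series(['85711', '94043', '85711', '85711'])
s.duplicated()
# 0    False   first time '85711' appears
# 1    False
# 2     True   '85711' already seen at position 0
# 3     True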
In [10]:
# check the type of the result (a pandas Series)
type(users.zip_code.duplicated())
Out[10]:
In [12]:
# since it's a boolean Series, .sum() counts the True values
# there are 148 duplicate zip codes
users.zip_code.duplicated().sum()
Out[12]:
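This works because booleans behave as integers under arithmetic, so summing a boolean Series counts the True values. A quick sketch:

pd.Series([True, False, True]).sum()   # 2, since each True counts as 1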
In [13]:
# outputs True only when an entire row is an exact duplicate of an earlier row
users.duplicated()
Out[13]:
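A toy sketch of the full-row rule (invented data): a row is flagged only when every column matches an earlier row.

df = pd.DataFrame({'age': [24, 53, 24, 24],
                   'zip_code': ['85711', '94043', '85711', '90210']})
df.duplicated()
# 0    False
# 1    False
# 2     True   identical to row 0 in every column
# 3    False   same age as row 0, but a different zip_code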
In [17]:
# examine the duplicate rows by using the boolean Series as a row mask
users.loc[users.duplicated(), :]
Out[17]:
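The boolean Series acts as a row mask inside .loc and returns only the flagged rows. A self-contained sketch on toy data:

df = pd.DataFrame({'age': [24, 53, 24],
                   'zip_code': ['85711', '94043', '85711']})
df.loc[df.duplicated(), :]   # returns only row 2, the repeat of row 0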
In [18]:
# keep='first' (the default)
# mark duplicates as True except for the first occurrence
users.loc[users.duplicated(keep='first'), :]
Out[18]:
In [19]:
# keep='last'
# mark duplicates as True except for the last occurrence
# still 7 rows, but now the earlier occurrence of each pair is flagged,
# which is useful for splitting the duplicates into their two halves
users.loc[users.duplicated(keep='last'), :]
Out[19]:
In [22]:
# mark all duplicates as True
# this is the union of the two result sets above (14 rows)
users.loc[users.duplicated(keep=False), :]
Out[22]:
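A compact comparison of the three keep options on a toy Series (invented values) shows why keep=False is the union of the other two:

s = pd.Series([1, 2, 1, 1])
s.duplicated(keep='first')   # [False, False, True, True]
s.duplicated(keep='last')    # [True, False, True, False]
s.duplicated(keep=False)     # [True, False, True, True]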
Dropping duplicates¶
In [23]:
# drops the 7 duplicate rows, keeping the first occurrence of each
users.drop_duplicates(keep='first').shape
Out[23]:
In [24]:
# also drops 7 rows, but keeps the last occurrence of each instead
users.drop_duplicates(keep='last').shape
Out[24]:
In [25]:
# drops all 14 rows (both occurrences of every duplicate)
users.drop_duplicates(keep=False).shape
Out[25]:
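A quick sanity check on the shapes; this is a sketch that assumes the file behind the shortlink still has 943 rows with 7 duplicate pairs, as the counts above suggest:

users.shape[0]                                 # 943 rows to start
users.drop_duplicates(keep='first').shape[0]   # 936 = 943 - 7
users.drop_duplicates(keep=False).shape[0]     # 929 = 943 - 14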
In [28]:
# only consider "age" and "zip_code" when testing for duplicates
users.duplicated(subset=['age', 'zip_code']).sum()
Out[28]:
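The same subset argument works with drop_duplicates, so you could de-duplicate on just those two columns. A sketch:

# keeps the first row for each (age, zip_code) combination
users.drop_duplicates(subset=['age', 'zip_code']).shape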