Finding, filtering and converting Series to NaN
This introduction to pandas is derived from Data School's pandas Q&A with my own notes and code.
Filtering and Converting Series to NaN
Always use .loc when selecting rows and columns of a DataFrame, especially when you intend to assign to the selection
In [1]:
import pandas as pd
In [2]:
# short link to the ratings dataset; read_csv fetches it over the network
# and parses it as CSV into the `movies` DataFrame used by every later cell
url = 'http://bit.ly/imdbratings'
movies = pd.read_csv(url)
In [3]:
# preview the first five rows to see which columns are available
movies.head(n=5)
Out[3]:
In [4]:
# number of rows whose content_rating is missing (NaN)
movies['content_rating'].isna().sum()
Out[4]:
In [5]:
# show the full rows (all columns) where content_rating is missing
movies.loc[movies['content_rating'].isna(), :]
Out[5]:
In [12]:
# tally every distinct content_rating value, keeping NaN in the tally
# the output shows 65 'NOT RATED' rows and 3 NaN rows;
# the goal is to merge them so that all 68 are NaN
movies['content_rating'].value_counts(dropna=False)
Out[12]:
In [13]:
# inspect the full rows whose content_rating is the string 'NOT RATED'
movies.loc[movies['content_rating'].eq('NOT RATED'), :]
Out[13]:
In [8]:
# same row filter, but keep only the content_rating column
movies.loc[movies['content_rating'].eq('NOT RATED'), 'content_rating']
Out[8]:
In [9]:
import numpy as np
In [14]:
# check what kind of object the single-column .loc selection returns
type(movies.loc[movies['content_rating'].eq('NOT RATED'), 'content_rating'])
Out[14]:
In [15]:
# selecting rows and column in a single .loc call assigns directly into movies,
# so this runs cleanly; chained indexing such as movies[cond]['content_rating'] = ...
# can trigger a SettingWithCopyWarning instead
# assigning np.nan to the selection turns every 'NOT RATED' entry into NaN
movies.loc[movies['content_rating'] == 'NOT RATED', 'content_rating'] = np.nan
In [17]:
# the missing-value count has grown from 3 to 68:
# the 3 original NaN plus the 65 former 'NOT RATED' entries
movies['content_rating'].isna().sum()
Out[17]:
Second example: SettingWithCopyWarning
In [18]:
# keep only the movies rated 9 or higher (all columns)
top_movies = movies.loc[movies['star_rating'] >= 9, :]
In [19]:
# display the selected rows before modifying them
top_movies
Out[19]:
In [22]:
# This assignment raises a SettingWithCopyWarning: top_movies was produced by
# slicing movies, so pandas cannot tell whether it is a view of movies or an
# independent copy, and therefore whether movies should change as well.
# NOTE: .loc[0, ...] targets the row with index label 0; this assumes that row
# passed the star_rating filter (otherwise .loc would append a new row) -- verify.
top_movies.loc[0, 'duration'] = 150
In [23]:
# inspect top_movies after the assignment to see what was actually written
top_movies
Out[23]:
In [25]:
# to avoid the warning (it is a warning, not an error), make the selection an
# explicit, independent copy with .copy() before modifying it
top_movies = movies.loc[movies['star_rating'] >= 9, :].copy()
In [27]:
# same assignment as before; with top_movies rebuilt via .copy(), it is expected
# to run without the SettingWithCopyWarning and to leave movies untouched
top_movies.loc[0, 'duration'] = 150