Pandas meets probabilities

Showcasing Pandas on-the-fly computation capabilities to calculate probabilities on timeseries.

Posted by dr. Etienne B. Roesch on April 19, 2017

You may find interesting:


New Notebook33

Showcasing Jupyter uNotebook Translator Layout


New Notebook

Showcasing Jupyter uNotebook Translator Layout

Pandas is extremely good at manipulating data. It can also compute new information on the fly. For a project, I had to manipulate timeseries data, to compute the probability that one variable be of values {-1, 0, 1, 2} given past values of the same half hour, every week.

# Import all libraries needed for the tutorial
import pandas as pd
import pandasql as ps
import numpy as np
import matplotlib.pyplot as plt
import sys # Only needed to determine Python version number
import matplotlib # Only needed to determine Matplotlib version number

from matplotlib.pylab import rcParams

# Enable inline plotting
%matplotlib inline

rcParams['figure.figsize'] = 10,10

print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)

dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

def add_half_hour(x):
    """Add a new column with half hour indices given a Dataframe row."""
    x['half_hour'] = (x.index.hour * 60 + x.index.minute) / 30
    return x

def calc_probabilities(s):
    """Returns a probability given a Series object."""
    temp = s.value_counts()
    return temp/temp.sum()
%%time
# Read data and create index as timestamps on-the-fly
logging_data = pd.read_csv("./logging_data.csv", parse_dates=['server_timestamp'],
                           index_col='server_timestamp', date_parser=dateparse)
# Displays some information about the dataset
#logging_data.info()
#print logging_data.head()
#print '\nData Types:'
#print logging_data.dtypes
#logging_data.index
%%time
#.. take one charger
grouped = logging_data.groupby('charger_id')
group =  list(grouped)[2][1]
group.head()  # Shows first few rows
%%time
#.. filter just one day
group['day'] = group['cpc_timestamp'].apply(lambda x: dateparse(x).weekday())
myday_group = group[group['day'] == 2]
myday_group.head()
%%time
#.. add half_hour indices
df2 = myday_group.groupby('charger_id').apply(add_half_hour)
%%time
#.. calculate probabilities
res = df2.groupby('half_hour')['ev_connection_status'].apply(calc_probabilities).sort_index()
res
res.unstack(level=-1).plot(kind='bar', stacked=True)