{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Pandas en 10 minutes" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
run previous cell, wait for 2 seconds
\n", "" ], "text/plain": [ "" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ " #Pour intégrer les graphes à votre notebook, il suffit de faire\n", "%matplotlib inline\n", "\n", "from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On importe générallement les librairies suivantes" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Création d'objets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On créé une 'Series' en lui passant une liste de valeurs, en laissant pandas créer un index d'entiers" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 3.0\n", "2 5.0\n", "3 NaN\n", "4 6.0\n", "5 8.0\n", "dtype: float64" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = pd.Series([1,3,5,np.nan,6,8])\n", "s" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On créé un DataFrame en passant un array numpy, avec un index sur sur une date et des colonnes labellisées" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',\n", " '2013-01-05', '2013-01-06'],\n", " dtype='datetime64[ns]', freq='D')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2013-01-01-1.184356-0.7238500.3910510.749447
2013-01-020.0275020.2420280.9371880.358945
2013-01-030.1210831.633698-0.134070-0.047994
2013-01-041.2475740.102763-0.254731-1.270130
2013-01-050.6053111.3390970.3136921.190966
2013-01-06-0.5829050.818028-0.2594980.305170
\n", "
" ], "text/plain": [ " A B C D\n", "2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n", "2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966\n", "2013-01-06 -0.582905 0.818028 -0.259498 0.305170" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.date_range('20130101', periods=6)\n", "print(dates)\n", "\n", "df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On peut également créer un DataFrame en passant un dictionnaire d'objets qui peut être converti en sorte de série." ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDEF
01.02013-01-021.03testfoo
11.02013-01-021.03trainfoo
21.02013-01-021.03testfoo
31.02013-01-021.03trainfoo
\n", "
" ], "text/plain": [ " A B C D E F\n", "0 1.0 2013-01-02 1.0 3 test foo\n", "1 1.0 2013-01-02 1.0 3 train foo\n", "2 1.0 2013-01-02 1.0 3 test foo\n", "3 1.0 2013-01-02 1.0 3 train foo" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = pd.DataFrame({ 'A' : 1.,\n", "'B' : pd.Timestamp('20130102'),\n", "'C' : pd.Series(1,index=list(range(4)),dtype='float32'),\n", "'D' : np.array([3] * 4,dtype='int32'),\n", "'E' : pd.Categorical([\"test\",\"train\",\"test\",\"train\"]),\n", "'F' : 'foo' })\n", "\n", "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Chaque colonne a son propre dtypes" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "A float64\n", "B datetime64[ns]\n", "C float32\n", "D int32\n", "E category\n", "F object\n", "dtype: object" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On peut afficher les premières lignes et les dernières" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " A B C D\n", "2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n", "2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966\n", " A B C D\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n", "2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966\n", "2013-01-06 -0.582905 0.818028 -0.259498 0.305170\n" ] } ], "source": [ "print(df.head())\n", "print(df.tail())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On peut afficher l'index, les colonnes et les données numpy" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',\n", " '2013-01-05', '2013-01-06'],\n", " dtype='datetime64[ns]', freq='D')\n", "Index(['A', 'B', 'C', 'D'], dtype='object')\n", "[[-1.18435607 -0.72384974 0.39105091 0.74944703]\n", " [ 0.02750198 0.24202818 0.93718807 0.35894525]\n", " [ 0.12108336 1.63369769 -0.13407011 -0.04799366]\n", " [ 1.24757372 0.10276272 -0.25473099 -1.27013015]\n", " [ 0.60531084 1.33909665 0.3136916 1.19096551]\n", " [-0.58290532 0.81802772 -0.25949756 0.30516996]]\n" ] } ], "source": [ "print(df.index)\n", "print(df.columns)\n", "print(df.values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "La méthode describe permet d'afficher un résumé des données" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
count6.0000006.0000006.0000006.000000
mean0.0390350.5686270.1656050.214401
std0.8569240.8699880.4722910.841637
min-1.184356-0.723850-0.259498-1.270130
25%-0.4303030.137579-0.2245660.040297
50%0.0742930.5300280.0898110.332058
75%0.4842541.2088290.3717110.651822
max1.2475741.6336980.9371881.190966
\n", "
" ], "text/plain": [ " A B C D\n", "count 6.000000 6.000000 6.000000 6.000000\n", "mean 0.039035 0.568627 0.165605 0.214401\n", "std 0.856924 0.869988 0.472291 0.841637\n", "min -1.184356 -0.723850 -0.259498 -1.270130\n", "25% -0.430303 0.137579 -0.224566 0.040297\n", "50% 0.074293 0.530028 0.089811 0.332058\n", "75% 0.484254 1.208829 0.371711 0.651822\n", "max 1.247574 1.633698 0.937188 1.190966" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On peut faire la transposée, trier en fonction d'un axe ou des valeurs" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06\n", "A -1.184356 0.027502 0.121083 1.247574 0.605311 -0.582905\n", "B -0.723850 0.242028 1.633698 0.102763 1.339097 0.818028\n", "C 0.391051 0.937188 -0.134070 -0.254731 0.313692 -0.259498\n", "D 0.749447 0.358945 -0.047994 -1.270130 1.190966 0.305170\n" ] } ], "source": [ "print(df.T)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DCBA
2013-01-010.7494470.391051-0.723850-1.184356
2013-01-020.3589450.9371880.2420280.027502
2013-01-03-0.047994-0.1340701.6336980.121083
2013-01-04-1.270130-0.2547310.1027631.247574
2013-01-051.1909660.3136921.3390970.605311
2013-01-060.305170-0.2594980.818028-0.582905
\n", "
" ], "text/plain": [ " D C B A\n", "2013-01-01 0.749447 0.391051 -0.723850 -1.184356\n", "2013-01-02 0.358945 0.937188 0.242028 0.027502\n", "2013-01-03 -0.047994 -0.134070 1.633698 0.121083\n", "2013-01-04 -1.270130 -0.254731 0.102763 1.247574\n", "2013-01-05 1.190966 0.313692 1.339097 0.605311\n", "2013-01-06 0.305170 -0.259498 0.818028 -0.582905" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_index(axis=1, ascending=False)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2013-01-01-1.184356-0.7238500.3910510.749447
2013-01-041.2475740.102763-0.254731-1.270130
2013-01-020.0275020.2420280.9371880.358945
2013-01-06-0.5829050.818028-0.2594980.305170
2013-01-050.6053111.3390970.3136921.190966
2013-01-030.1210831.633698-0.134070-0.047994
\n", "
" ], "text/plain": [ " A B C D\n", "2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n", "2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-06 -0.582905 0.818028 -0.259498 0.305170\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(by='B')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Selection des données" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Getting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Selection d'une colonne (équivalent à df.A)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2013-01-01 -1.184356\n", "2013-01-02 0.027502\n", "2013-01-03 0.121083\n", "2013-01-04 1.247574\n", "2013-01-05 0.605311\n", "2013-01-06 -0.582905\n", "Freq: D, Name: A, dtype: float64\n", " A B C D\n", "2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n", " A B C D\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n", "2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n" ] } ], "source": [ "print(df['A'])\n", "print(df[0:3])\n", "print(df['20130102':'20130104'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Selection par Label" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "En utilisant un label" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "A -1.184356\n", "B -0.723850\n", "C 0.391051\n", "D 0.749447\n", "Name: 2013-01-01 00:00:00, dtype: float64" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[dates[0]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Selection de plusieurs axes par label" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2013-01-01-1.184356-0.723850
2013-01-020.0275020.242028
2013-01-030.1210831.633698
2013-01-041.2475740.102763
2013-01-050.6053111.339097
2013-01-06-0.5829050.818028
\n", "
" ], "text/plain": [ " A B\n", "2013-01-01 -1.184356 -0.723850\n", "2013-01-02 0.027502 0.242028\n", "2013-01-03 0.121083 1.633698\n", "2013-01-04 1.247574 0.102763\n", "2013-01-05 0.605311 1.339097\n", "2013-01-06 -0.582905 0.818028" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[:,['A','B']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Avec le label slicing, les deux points de terminaisons sont INCLUS" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2013-01-020.0275020.242028
2013-01-030.1210831.633698
2013-01-041.2475740.102763
\n", "
" ], "text/plain": [ " A B\n", "2013-01-02 0.027502 0.242028\n", "2013-01-03 0.121083 1.633698\n", "2013-01-04 1.247574 0.102763" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc['20130102':'20130104', ['A','B']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Obtenir une valeur scalaire" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "-1.184356071097634" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[dates[0],'A']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Acces plus rapide (méthode équivalente à la précédente)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "-1.184356071097634" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.at[dates[0],'A']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Selection par position" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Integer : " ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "A 1.247574\n", "B 0.102763\n", "C -0.254731\n", "D -1.270130\n", "Name: 2013-01-04 00:00:00, dtype: float64" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tranches d'entiers, similaire à numpy" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
2013-01-041.2475740.102763
2013-01-050.6053111.339097
\n", "
" ], "text/plain": [ " A B\n", "2013-01-04 1.247574 0.102763\n", "2013-01-05 0.605311 1.339097" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[3:5,0:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Par liste d'entiers" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AC
2013-01-020.0275020.937188
2013-01-030.121083-0.134070
2013-01-050.6053110.313692
\n", "
" ], "text/plain": [ " A C\n", "2013-01-02 0.027502 0.937188\n", "2013-01-03 0.121083 -0.134070\n", "2013-01-05 0.605311 0.313692" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[[1,2,4],[0,2]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Découpage de ligne explicite" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2013-01-020.0275020.2420280.9371880.358945
2013-01-030.1210831.633698-0.134070-0.047994
\n", "
" ], "text/plain": [ " A B C D\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[1:3,:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Obtenir une valeur explicitement" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.24202817975998675" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[1,1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Acces rapide au scalaire" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.24202817975998675" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iat[1,1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Indexation booléenne" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "En utilisant une valeur sur une colonne : " ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2013-01-020.0275020.2420280.9371880.358945
2013-01-030.1210831.633698-0.134070-0.047994
2013-01-041.2475740.102763-0.254731-1.270130
2013-01-050.6053111.3390970.3136921.190966
\n", "
" ], "text/plain": [ " A B C D\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n", "2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.A > 0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Opérateur where : " ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2013-01-01NaNNaN0.3910510.749447
2013-01-020.0275020.2420280.9371880.358945
2013-01-030.1210831.633698NaNNaN
2013-01-041.2475740.102763NaNNaN
2013-01-050.6053111.3390970.3136921.190966
2013-01-06NaN0.818028NaN0.305170
\n", "
" ], "text/plain": [ " A B C D\n", "2013-01-01 NaN NaN 0.391051 0.749447\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945\n", "2013-01-03 0.121083 1.633698 NaN NaN\n", "2013-01-04 1.247574 0.102763 NaN NaN\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966\n", "2013-01-06 NaN 0.818028 NaN 0.305170" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df > 0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pour filter, on utilise la méthode isin()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " A B C D E\n", "2013-01-01 -1.184356 -0.723850 0.391051 0.749447 one\n", "2013-01-02 0.027502 0.242028 0.937188 0.358945 one\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994 two\n", "2013-01-04 1.247574 0.102763 -0.254731 -1.270130 three\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966 four\n", "2013-01-06 -0.582905 0.818028 -0.259498 0.305170 three\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
2013-01-030.1210831.633698-0.134070-0.047994two
2013-01-050.6053111.3390970.3136921.190966four
\n", "
" ], "text/plain": [ " A B C D E\n", "2013-01-03 0.121083 1.633698 -0.134070 -0.047994 two\n", "2013-01-05 0.605311 1.339097 0.313692 1.190966 four" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df.copy()\n", "df2['E'] = ['one', 'one','two','three','four','three']\n", "print(df2)\n", "\n", "df2[df2['E'].isin(['two','four'])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ajouter / modifier valeurs / colonnes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ajouter une nouvelle colonne automatiquement aligne les données par index." ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2013-01-02 1\n", "2013-01-03 2\n", "2013-01-04 3\n", "2013-01-05 4\n", "2013-01-06 5\n", "2013-01-07 6\n", "Freq: D, dtype: int64\n" ] } ], "source": [ "s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))\n", "print(s1)\n", "df['F'] = s1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modifier une valeur par label" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.at[dates[0],'A'] = 0" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[dates[0],'A']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modifier une valeur par position" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.iat[0,1] = 0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modifier une valeur en assignant un tableau numpy" ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.loc[:,'D'] = np.array([5] * len(df))" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2013-01-010.0000000.0000000.3910515NaN
2013-01-020.0275020.2420280.93718851.0
2013-01-030.1210831.633698-0.13407052.0
2013-01-041.2475740.102763-0.25473153.0
2013-01-050.6053111.3390970.31369254.0
2013-01-06-0.5829050.818028-0.25949855.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2013-01-01 0.000000 0.000000 0.391051 5 NaN\n", "2013-01-02 0.027502 0.242028 0.937188 5 1.0\n", "2013-01-03 0.121083 1.633698 -0.134070 5 2.0\n", "2013-01-04 1.247574 0.102763 -0.254731 5 3.0\n", "2013-01-05 0.605311 1.339097 0.313692 5 4.0\n", "2013-01-06 -0.582905 0.818028 -0.259498 5 5.0" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Gérer les données manquantes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pandas utilise le type np.nan pour représenter les valeurs manquantes. Ce n'est pas codé pour faire des calculs." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reindex permet de changer/ajouter/supprimer les index d'un axe. Cette fonction retourne une copie des données" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2013-01-010.0000000.0000000.3910515NaN1.0
2013-01-020.0275020.2420280.93718851.01.0
2013-01-030.1210831.633698-0.13407052.0NaN
2013-01-041.2475740.102763-0.25473153.0NaN
\n", "
" ], "text/plain": [ " A B C D F E\n", "2013-01-01 0.000000 0.000000 0.391051 5 NaN 1.0\n", "2013-01-02 0.027502 0.242028 0.937188 5 1.0 1.0\n", "2013-01-03 0.121083 1.633698 -0.134070 5 2.0 NaN\n", "2013-01-04 1.247574 0.102763 -0.254731 5 3.0 NaN" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])\n", "df1.loc[dates[0]:dates[1],'E'] = 1\n", "df1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pour supprimer les lignes contenant des NaN :" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2013-01-020.0275020.2420280.93718851.01.0
\n", "
" ], "text/plain": [ " A B C D F E\n", "2013-01-02 0.027502 0.242028 0.937188 5 1.0 1.0" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.dropna(how='any')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remplacement des valeurs manquantes" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2013-01-010.0000000.0000000.39105155.01.0
2013-01-020.0275020.2420280.93718851.01.0
2013-01-030.1210831.633698-0.13407052.05.0
2013-01-041.2475740.102763-0.25473153.05.0
\n", "
" ], "text/plain": [ " A B C D F E\n", "2013-01-01 0.000000 0.000000 0.391051 5 5.0 1.0\n", "2013-01-02 0.027502 0.242028 0.937188 5 1.0 1.0\n", "2013-01-03 0.121083 1.633698 -0.134070 5 2.0 5.0\n", "2013-01-04 1.247574 0.102763 -0.254731 5 3.0 5.0" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.fillna(value=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Obtenir le masque de booléen de l'emplacement des nan" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDFE
2013-01-01FalseFalseFalseFalseTrueFalse
2013-01-02FalseFalseFalseFalseFalseFalse
2013-01-03FalseFalseFalseFalseFalseTrue
2013-01-04FalseFalseFalseFalseFalseTrue
\n", "
" ], "text/plain": [ " A B C D F E\n", "2013-01-01 False False False False True False\n", "2013-01-02 False False False False False False\n", "2013-01-03 False False False False False True\n", "2013-01-04 False False False False False True" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.isnull(df1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Opérations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Les opérations excluent généralement les données manquantes." ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A 0.236427\n", "B 0.689269\n", "C 0.165605\n", "D 5.000000\n", "F 3.000000\n", "dtype: float64\n", "2013-01-01 1.347763\n", "2013-01-02 1.441344\n", "2013-01-03 1.724142\n", "2013-01-04 1.819121\n", "2013-01-05 2.251620\n", "2013-01-06 1.995125\n", "Freq: D, dtype: float64\n" ] } ], "source": [ "print(df.mean())\n", "print(df.mean(1)) #Autre axe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Situation avec des objets de dimmension différentes. En plus, pandas va automatiquement étendre la donnée sur la dimension spécifiée" ] }, { "cell_type": "code", "execution_count": 96, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2013-01-01 NaN\n", "2013-01-02 NaN\n", "2013-01-03 1.0\n", "2013-01-04 3.0\n", "2013-01-05 5.0\n", "2013-01-06 NaN\n", "Freq: D, dtype: float64\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2013-01-01NaNNaNNaNNaNNaN
2013-01-02NaNNaNNaNNaNNaN
2013-01-03-0.8789170.633698-1.1340704.01.0
2013-01-04-1.752426-2.897237-3.2547312.00.0
2013-01-05-4.394689-3.660903-4.6863080.0-1.0
2013-01-06NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " A B C D F\n", "2013-01-01 NaN NaN NaN NaN NaN\n", "2013-01-02 NaN NaN NaN NaN NaN\n", "2013-01-03 -0.878917 0.633698 -1.134070 4.0 1.0\n", "2013-01-04 -1.752426 -2.897237 -3.254731 2.0 0.0\n", "2013-01-05 -4.394689 -3.660903 -4.686308 0.0 -1.0\n", "2013-01-06 NaN NaN NaN NaN NaN" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)\n", "\n", "print(s)\n", "\n", "df.sub(s, axis='index')" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on method sub in module pandas.core.ops:\n", "\n", "sub(other, axis='columns', level=None, fill_value=None) method of pandas.core.frame.DataFrame instance\n", " Get Subtraction of dataframe and other, element-wise (binary operator `sub`).\n", " \n", " Equivalent to ``dataframe - other``, but with support to substitute a fill_value\n", " for missing data in one of the inputs. With reverse version, `rsub`.\n", " \n", " Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to\n", " arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`.\n", " \n", " Parameters\n", " ----------\n", " other : scalar, sequence, Series, or DataFrame\n", " Any single or multiple element data structure, or list-like object.\n", " axis : {0 or 'index', 1 or 'columns'}\n", " Whether to compare by the index (0 or 'index') or columns\n", " (1 or 'columns'). For Series input, axis to match Series index on.\n", " level : int or label\n", " Broadcast across a level, matching Index values on the\n", " passed MultiIndex level.\n", " fill_value : float or None, default None\n", " Fill existing missing (NaN) values, and any new element needed for\n", " successful DataFrame alignment, with this value before computation.\n", " If data in both corresponding DataFrame locations is missing\n", " the result will be missing.\n", " \n", " Returns\n", " -------\n", " DataFrame\n", " Result of the arithmetic operation.\n", " \n", " See Also\n", " --------\n", " DataFrame.add : Add DataFrames.\n", " DataFrame.sub : Subtract DataFrames.\n", " DataFrame.mul : Multiply DataFrames.\n", " DataFrame.div : Divide DataFrames (float division).\n", " DataFrame.truediv : Divide DataFrames (float division).\n", " DataFrame.floordiv : Divide DataFrames (integer division).\n", " DataFrame.mod : Calculate modulo (remainder after division).\n", " DataFrame.pow : Calculate exponential power.\n", " \n", " Notes\n", " -----\n", " Mismatched indices will be unioned together.\n", " \n", " Examples\n", " --------\n", " >>> df = pd.DataFrame({'angles': [0, 3, 4],\n", " ... 'degrees': [360, 180, 360]},\n", " ... index=['circle', 'triangle', 'rectangle'])\n", " >>> df\n", " angles degrees\n", " circle 0 360\n", " triangle 3 180\n", " rectangle 4 360\n", " \n", " Add a scalar with operator version which return the same\n", " results.\n", " \n", " >>> df + 1\n", " angles degrees\n", " circle 1 361\n", " triangle 4 181\n", " rectangle 5 361\n", " \n", " >>> df.add(1)\n", " angles degrees\n", " circle 1 361\n", " triangle 4 181\n", " rectangle 5 361\n", " \n", " Divide by constant with reverse version.\n", " \n", " >>> df.div(10)\n", " angles degrees\n", " circle 0.0 36.0\n", " triangle 0.3 18.0\n", " rectangle 0.4 36.0\n", " \n", " >>> df.rdiv(10)\n", " angles degrees\n", " circle inf 0.027778\n", " triangle 3.333333 0.055556\n", " rectangle 2.500000 0.027778\n", " \n", " Subtract a list and Series by axis with operator version.\n", " \n", " >>> df - [1, 2]\n", " angles degrees\n", " circle -1 358\n", " triangle 2 178\n", " rectangle 3 358\n", " \n", " >>> df.sub([1, 2], axis='columns')\n", " angles degrees\n", " circle -1 358\n", " triangle 2 178\n", " rectangle 3 358\n", " \n", " >>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']),\n", " ... axis='index')\n", " angles degrees\n", " circle -1 359\n", " triangle 2 179\n", " rectangle 3 359\n", " \n", " Multiply a DataFrame of different shape with operator version.\n", " \n", " >>> other = pd.DataFrame({'angles': [0, 3, 4]},\n", " ... index=['circle', 'triangle', 'rectangle'])\n", " >>> other\n", " angles\n", " circle 0\n", " triangle 3\n", " rectangle 4\n", " \n", " >>> df * other\n", " angles degrees\n", " circle 0 NaN\n", " triangle 9 NaN\n", " rectangle 16 NaN\n", " \n", " >>> df.mul(other, fill_value=0)\n", " angles degrees\n", " circle 0 0.0\n", " triangle 9 0.0\n", " rectangle 16 0.0\n", " \n", " Divide by a MultiIndex by level.\n", " \n", " >>> df_multindex = pd.DataFrame({'angles': [0, 3, 4, 4, 5, 6],\n", " ... 'degrees': [360, 180, 360, 360, 540, 720]},\n", " ... index=[['A', 'A', 'A', 'B', 'B', 'B'],\n", " ... ['circle', 'triangle', 'rectangle',\n", " ... 'square', 'pentagon', 'hexagon']])\n", " >>> df_multindex\n", " angles degrees\n", " A circle 0 360\n", " triangle 3 180\n", " rectangle 4 360\n", " B square 4 360\n", " pentagon 5 540\n", " hexagon 6 720\n", " \n", " >>> df.div(df_multindex, level=1, fill_value=0)\n", " angles degrees\n", " A circle NaN 1.0\n", " triangle 1.0 1.0\n", " rectangle 1.0 1.0\n", " B square 0.0 0.0\n", " pentagon 0.0 0.0\n", " hexagon 0.0 0.0\n", "\n" ] } ], "source": [ "help(df.sub)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2013-01-01NaNNaNNaNNaNNaN
2013-01-02NaNNaNNaNNaNNaN
2013-01-03-0.8789170.633698-1.1340704.01.0
2013-01-04-1.752426-2.897237-3.2547312.00.0
2013-01-05-4.394689-3.660903-4.6863080.0-1.0
2013-01-06NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " A B C D F\n", "2013-01-01 NaN NaN NaN NaN NaN\n", "2013-01-02 NaN NaN NaN NaN NaN\n", "2013-01-03 -0.878917 0.633698 -1.134070 4.0 1.0\n", "2013-01-04 -1.752426 -2.897237 -3.254731 2.0 0.0\n", "2013-01-05 -4.394689 -3.660903 -4.686308 0.0 -1.0\n", "2013-01-06 NaN NaN NaN NaN NaN" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sub(s, axis='index')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Apply" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Appliquer des foncitons aux données" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDF
2013-01-010.0000000.0000000.3910515NaN
2013-01-020.0275020.2420281.328239101.0
2013-01-030.1485851.8757261.194169153.0
2013-01-041.3961591.9784890.939438206.0
2013-01-052.0014703.3175851.2531292510.0
2013-01-061.4185654.1356130.9936323015.0
\n", "
" ], "text/plain": [ " A B C D F\n", "2013-01-01 0.000000 0.000000 0.391051 5 NaN\n", "2013-01-02 0.027502 0.242028 1.328239 10 1.0\n", "2013-01-03 0.148585 1.875726 1.194169 15 3.0\n", "2013-01-04 1.396159 1.978489 0.939438 20 6.0\n", "2013-01-05 2.001470 3.317585 1.253129 25 10.0\n", "2013-01-06 1.418565 4.135613 0.993632 30 15.0" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.apply(np.cumsum)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "A 1.830479\n", "B 1.633698\n", "C 1.196686\n", "D 0.000000\n", "F 4.000000\n", "dtype: float64" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.apply((lambda x: x.max() - x.min()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Histogramme" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 3\n", "1 2\n", "2 3\n", "3 3\n", "4 5\n", "5 3\n", "6 4\n", "7 0\n", "8 1\n", "9 2\n", "dtype: int64\n", "3 4\n", "2 2\n", "0 1\n", "1 1\n", "4 1\n", "5 1\n", "dtype: int64\n" ] } ], "source": [ "s = pd.Series(np.random.randint(0, 7, size=10))\n", "print(s)\n", "print(s.value_counts())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Methodes String" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Les séries sont équipées de méthodes pour traiter les strings avec l'attribut str qui rend facile la manipulation de chaque élémen d'un tableau. On utilise régulièrement des expressions régulières." ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 a\n", "1 b\n", "2 c\n", "3 aaba\n", "4 baca\n", "5 NaN\n", "6 caba\n", "7 dog\n", "8 cat\n", "dtype: object" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])\n", "s.str.lower()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Regrouper" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Concaténation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pandas fournit des methodes pour facilement combiner des Series, DatFrame et des Panel objets avec des types variés de set logique pour les indexes et des fonctionnalités d'algèbre dans le cas de jointure / regroupement" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On peut concaténer des objets pandas avec concat()" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2 3\n", "0 0.069013 -0.040383 -0.607478 -1.286906\n", "1 1.270398 -0.523822 0.858627 0.751044\n", "2 1.480067 -0.005535 0.244562 -0.224817\n", "3 -0.730252 0.188720 0.105386 -0.600060\n", "4 -0.844170 1.736848 0.190962 0.451052\n", "5 0.181550 0.136675 0.892205 0.701368\n", "6 -0.485632 -0.894081 0.532846 -1.228718\n", "7 0.324976 -1.670890 -0.082170 -0.383374\n", "8 0.589956 -0.859584 0.737789 1.193208\n", "9 1.124989 -1.301511 2.010070 0.593592\n" ] } ], "source": [ "df = pd.DataFrame(np.random.randn(10, 4))\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pieces = [df[:3], df[3:7], df[7:]]" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
00.069013-0.040383-0.607478-1.286906
11.270398-0.5238220.8586270.751044
21.480067-0.0055350.244562-0.224817
3-0.7302520.1887200.105386-0.600060
4-0.8441701.7368480.1909620.451052
50.1815500.1366750.8922050.701368
6-0.485632-0.8940810.532846-1.228718
70.324976-1.670890-0.082170-0.383374
80.589956-0.8595840.7377891.193208
91.124989-1.3015112.0100700.593592
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 0.069013 -0.040383 -0.607478 -1.286906\n", "1 1.270398 -0.523822 0.858627 0.751044\n", "2 1.480067 -0.005535 0.244562 -0.224817\n", "3 -0.730252 0.188720 0.105386 -0.600060\n", "4 -0.844170 1.736848 0.190962 0.451052\n", "5 0.181550 0.136675 0.892205 0.701368\n", "6 -0.485632 -0.894081 0.532846 -1.228718\n", "7 0.324976 -1.670890 -0.082170 -0.383374\n", "8 0.589956 -0.859584 0.737789 1.193208\n", "9 1.124989 -1.301511 2.010070 0.593592" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(pieces)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Jointures" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On peut merger à la manière de requete SQL." ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " key lval\n", "0 foo 1\n", "1 foo 2\n", " key rval\n", "0 foo 4\n", "1 foo 5\n", " key lval rval\n", "0 foo 1 4\n", "1 foo 1 5\n", "2 foo 2 4\n", "3 foo 2 5\n" ] } ], "source": [ "left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})\n", "right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})\n", "\n", "print(left)\n", "print(right)\n", "\n", "print(pd.merge(left, right, on='key'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Append" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
0-1.0804560.9817860.8800261.144814
1-1.665425-0.5770141.265051-0.515127
20.342626-0.6351121.5587371.070910
3-0.236688-1.1781350.3719700.268590
4-0.4783430.0153310.8357940.553637
5-1.115995-0.086228-0.959499-1.168225
60.615278-0.550461-0.3280220.753208
7-0.3083741.024291-1.4375371.189244
\n", "
" ], "text/plain": [ " A B C D\n", "0 -1.080456 0.981786 0.880026 1.144814\n", "1 -1.665425 -0.577014 1.265051 -0.515127\n", "2 0.342626 -0.635112 1.558737 1.070910\n", "3 -0.236688 -1.178135 0.371970 0.268590\n", "4 -0.478343 0.015331 0.835794 0.553637\n", "5 -1.115995 -0.086228 -0.959499 -1.168225\n", "6 0.615278 -0.550461 -0.328022 0.753208\n", "7 -0.308374 1.024291 -1.437537 1.189244" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])\n", "df" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
0-1.0804560.9817860.8800261.144814
1-1.665425-0.5770141.265051-0.515127
20.342626-0.6351121.5587371.070910
3-0.236688-1.1781350.3719700.268590
4-0.4783430.0153310.8357940.553637
5-1.115995-0.086228-0.959499-1.168225
60.615278-0.550461-0.3280220.753208
7-0.3083741.024291-1.4375371.189244
8-0.236688-1.1781350.3719700.268590
\n", "
" ], "text/plain": [ " A B C D\n", "0 -1.080456 0.981786 0.880026 1.144814\n", "1 -1.665425 -0.577014 1.265051 -0.515127\n", "2 0.342626 -0.635112 1.558737 1.070910\n", "3 -0.236688 -1.178135 0.371970 0.268590\n", "4 -0.478343 0.015331 0.835794 0.553637\n", "5 -1.115995 -0.086228 -0.959499 -1.168225\n", "6 0.615278 -0.550461 -0.328022 0.753208\n", "7 -0.308374 1.024291 -1.437537 1.189244\n", "8 -0.236688 -1.178135 0.371970 0.268590" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = df.iloc[3]\n", "df.append(s, ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Groupement" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Le regroupement comprend les étapes suivantes :\n", " * Séparation de la donnée en groupes\n", " * Appliquer une fonction a chaque group indépendamment\n", " * Combiner les resultats dans une structure de données" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',\n", "'foo', 'bar', 'foo', 'foo'],\n", "'B' : ['one', 'one', 'two', 'three',\n", "'two', 'two', 'one', 'three'],\n", "'C' : np.random.randn(8),\n", "'D' : np.random.randn(8)})" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
0fooone0.4384430.479237
1barone-0.457745-0.363322
2footwo0.684435-0.421288
3barthree-0.6202410.267481
4footwo-0.0325912.801863
5bartwo0.878103-0.064529
6fooone-0.000896-1.695125
7foothree1.341618-0.165415
\n", "
" ], "text/plain": [ " A B C D\n", "0 foo one 0.438443 0.479237\n", "1 bar one -0.457745 -0.363322\n", "2 foo two 0.684435 -0.421288\n", "3 bar three -0.620241 0.267481\n", "4 foo two -0.032591 2.801863\n", "5 bar two 0.878103 -0.064529\n", "6 foo one -0.000896 -1.695125\n", "7 foo three 1.341618 -0.165415" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Groupement et somme des groupes" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CD
A
bar-0.199883-0.160370
foo2.4310110.999271
\n", "
" ], "text/plain": [ " C D\n", "A \n", "bar -0.199883 -0.160370\n", "foo 2.431011 0.999271" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('A').sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Groupement de multiple colonnes" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CD
AB
barone-0.457745-0.363322
three-0.6202410.267481
two0.878103-0.064529
fooone0.437547-1.215888
three1.341618-0.165415
two0.6518452.380574
\n", "
" ], "text/plain": [ " C D\n", "A B \n", "bar one -0.457745 -0.363322\n", " three -0.620241 0.267481\n", " two 0.878103 -0.064529\n", "foo one 0.437547 -1.215888\n", " three 1.341618 -0.165415\n", " two 0.651845 2.380574" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(['A','B']).sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Reformation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stack" ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
firstsecond
barone-0.347244-0.780456
two-0.394013-0.561465
bazone1.0553222.620028
two-1.4434170.514155
\n", "
" ], "text/plain": [ " A B\n", "first second \n", "bar one -0.347244 -0.780456\n", " two -0.394013 -0.561465\n", "baz one 1.055322 2.620028\n", " two -1.443417 0.514155" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',\n", "'foo', 'foo', 'qux', 'qux'],\n", "['one', 'two', 'one', 'two',\n", "'one', 'two', 'one', 'two']]))\n", "\n", "index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])\n", "df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])\n", "df2 = df[:4]\n", "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "La méthode stack() compresses un level dans les colonnes du dataframe\n" ] }, { "cell_type": "code", "execution_count": 114, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "first second \n", "bar one A -0.347244\n", " B -0.780456\n", " two A -0.394013\n", " B -0.561465\n", "baz one A 1.055322\n", " B 2.620028\n", " two A -1.443417\n", " B 0.514155\n", "dtype: float64" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stacked = df2.stack()\n", "stacked" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Avec une 'stacked' dataframe ou série, l'opération inverse est unstack()" ] } ], "metadata": { "anaconda-cloud": {}, "interpreter": { "hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f" }, "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 1 }