{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pandas en 10 minutes"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
""
],
"text/plain": [
""
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" #Pour intégrer les graphes à votre notebook, il suffit de faire\n",
"%matplotlib inline\n",
"\n",
"from jyquickhelper import add_notebook_menu\n",
"add_notebook_menu()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On importe générallement les librairies suivantes"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Création d'objets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On créé une 'Series' en lui passant une liste de valeurs, en laissant pandas créer un index d'entiers"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 3.0\n",
"2 5.0\n",
"3 NaN\n",
"4 6.0\n",
"5 8.0\n",
"dtype: float64"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = pd.Series([1,3,5,np.nan,6,8])\n",
"s"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On créé un DataFrame en passant un array numpy, avec un index sur sur une date et des colonnes labellisées"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',\n",
" '2013-01-05', '2013-01-06'],\n",
" dtype='datetime64[ns]', freq='D')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" -1.184356 | \n",
" -0.723850 | \n",
" 0.391051 | \n",
" 0.749447 | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 0.358945 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" -0.047994 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
" -0.254731 | \n",
" -1.270130 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
" 0.313692 | \n",
" 1.190966 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" -0.582905 | \n",
" 0.818028 | \n",
" -0.259498 | \n",
" 0.305170 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n",
"2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966\n",
"2013-01-06 -0.582905 0.818028 -0.259498 0.305170"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dates = pd.date_range('20130101', periods=6)\n",
"print(dates)\n",
"\n",
"df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut également créer un DataFrame en passant un dictionnaire d'objets qui peut être converti en sorte de série."
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" E | \n",
" F | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2013-01-02 | \n",
" 1.0 | \n",
" 3 | \n",
" test | \n",
" foo | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" 2013-01-02 | \n",
" 1.0 | \n",
" 3 | \n",
" train | \n",
" foo | \n",
"
\n",
" \n",
" 2 | \n",
" 1.0 | \n",
" 2013-01-02 | \n",
" 1.0 | \n",
" 3 | \n",
" test | \n",
" foo | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" 2013-01-02 | \n",
" 1.0 | \n",
" 3 | \n",
" train | \n",
" foo | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D E F\n",
"0 1.0 2013-01-02 1.0 3 test foo\n",
"1 1.0 2013-01-02 1.0 3 train foo\n",
"2 1.0 2013-01-02 1.0 3 test foo\n",
"3 1.0 2013-01-02 1.0 3 train foo"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = pd.DataFrame({ 'A' : 1.,\n",
"'B' : pd.Timestamp('20130102'),\n",
"'C' : pd.Series(1,index=list(range(4)),dtype='float32'),\n",
"'D' : np.array([3] * 4,dtype='int32'),\n",
"'E' : pd.Categorical([\"test\",\"train\",\"test\",\"train\"]),\n",
"'F' : 'foo' })\n",
"\n",
"df2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Chaque colonne a son propre dtypes"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A float64\n",
"B datetime64[ns]\n",
"C float32\n",
"D int32\n",
"E category\n",
"F object\n",
"dtype: object"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut afficher les premières lignes et les dernières"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" A B C D\n",
"2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n",
"2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966\n",
" A B C D\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n",
"2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966\n",
"2013-01-06 -0.582905 0.818028 -0.259498 0.305170\n"
]
}
],
"source": [
"print(df.head())\n",
"print(df.tail())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut afficher l'index, les colonnes et les données numpy"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',\n",
" '2013-01-05', '2013-01-06'],\n",
" dtype='datetime64[ns]', freq='D')\n",
"Index(['A', 'B', 'C', 'D'], dtype='object')\n",
"[[-1.18435607 -0.72384974 0.39105091 0.74944703]\n",
" [ 0.02750198 0.24202818 0.93718807 0.35894525]\n",
" [ 0.12108336 1.63369769 -0.13407011 -0.04799366]\n",
" [ 1.24757372 0.10276272 -0.25473099 -1.27013015]\n",
" [ 0.60531084 1.33909665 0.3136916 1.19096551]\n",
" [-0.58290532 0.81802772 -0.25949756 0.30516996]]\n"
]
}
],
"source": [
"print(df.index)\n",
"print(df.columns)\n",
"print(df.values)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"La méthode describe permet d'afficher un résumé des données"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 6.000000 | \n",
" 6.000000 | \n",
" 6.000000 | \n",
" 6.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 0.039035 | \n",
" 0.568627 | \n",
" 0.165605 | \n",
" 0.214401 | \n",
"
\n",
" \n",
" std | \n",
" 0.856924 | \n",
" 0.869988 | \n",
" 0.472291 | \n",
" 0.841637 | \n",
"
\n",
" \n",
" min | \n",
" -1.184356 | \n",
" -0.723850 | \n",
" -0.259498 | \n",
" -1.270130 | \n",
"
\n",
" \n",
" 25% | \n",
" -0.430303 | \n",
" 0.137579 | \n",
" -0.224566 | \n",
" 0.040297 | \n",
"
\n",
" \n",
" 50% | \n",
" 0.074293 | \n",
" 0.530028 | \n",
" 0.089811 | \n",
" 0.332058 | \n",
"
\n",
" \n",
" 75% | \n",
" 0.484254 | \n",
" 1.208829 | \n",
" 0.371711 | \n",
" 0.651822 | \n",
"
\n",
" \n",
" max | \n",
" 1.247574 | \n",
" 1.633698 | \n",
" 0.937188 | \n",
" 1.190966 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"count 6.000000 6.000000 6.000000 6.000000\n",
"mean 0.039035 0.568627 0.165605 0.214401\n",
"std 0.856924 0.869988 0.472291 0.841637\n",
"min -1.184356 -0.723850 -0.259498 -1.270130\n",
"25% -0.430303 0.137579 -0.224566 0.040297\n",
"50% 0.074293 0.530028 0.089811 0.332058\n",
"75% 0.484254 1.208829 0.371711 0.651822\n",
"max 1.247574 1.633698 0.937188 1.190966"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut faire la transposée, trier en fonction d'un axe ou des valeurs"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06\n",
"A -1.184356 0.027502 0.121083 1.247574 0.605311 -0.582905\n",
"B -0.723850 0.242028 1.633698 0.102763 1.339097 0.818028\n",
"C 0.391051 0.937188 -0.134070 -0.254731 0.313692 -0.259498\n",
"D 0.749447 0.358945 -0.047994 -1.270130 1.190966 0.305170\n"
]
}
],
"source": [
"print(df.T)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" D | \n",
" C | \n",
" B | \n",
" A | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" 0.749447 | \n",
" 0.391051 | \n",
" -0.723850 | \n",
" -1.184356 | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.358945 | \n",
" 0.937188 | \n",
" 0.242028 | \n",
" 0.027502 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" -0.047994 | \n",
" -0.134070 | \n",
" 1.633698 | \n",
" 0.121083 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" -1.270130 | \n",
" -0.254731 | \n",
" 0.102763 | \n",
" 1.247574 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 1.190966 | \n",
" 0.313692 | \n",
" 1.339097 | \n",
" 0.605311 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" 0.305170 | \n",
" -0.259498 | \n",
" 0.818028 | \n",
" -0.582905 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" D C B A\n",
"2013-01-01 0.749447 0.391051 -0.723850 -1.184356\n",
"2013-01-02 0.358945 0.937188 0.242028 0.027502\n",
"2013-01-03 -0.047994 -0.134070 1.633698 0.121083\n",
"2013-01-04 -1.270130 -0.254731 0.102763 1.247574\n",
"2013-01-05 1.190966 0.313692 1.339097 0.605311\n",
"2013-01-06 0.305170 -0.259498 0.818028 -0.582905"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_index(axis=1, ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" -1.184356 | \n",
" -0.723850 | \n",
" 0.391051 | \n",
" 0.749447 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
" -0.254731 | \n",
" -1.270130 | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 0.358945 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" -0.582905 | \n",
" 0.818028 | \n",
" -0.259498 | \n",
" 0.305170 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
" 0.313692 | \n",
" 1.190966 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" -0.047994 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n",
"2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-06 -0.582905 0.818028 -0.259498 0.305170\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(by='B')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Selection des données"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Getting"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Selection d'une colonne (équivalent à df.A)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2013-01-01 -1.184356\n",
"2013-01-02 0.027502\n",
"2013-01-03 0.121083\n",
"2013-01-04 1.247574\n",
"2013-01-05 0.605311\n",
"2013-01-06 -0.582905\n",
"Freq: D, Name: A, dtype: float64\n",
" A B C D\n",
"2013-01-01 -1.184356 -0.723850 0.391051 0.749447\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n",
" A B C D\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n",
"2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n"
]
}
],
"source": [
"print(df['A'])\n",
"print(df[0:3])\n",
"print(df['20130102':'20130104'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Selection par Label"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"En utilisant un label"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A -1.184356\n",
"B -0.723850\n",
"C 0.391051\n",
"D 0.749447\n",
"Name: 2013-01-01 00:00:00, dtype: float64"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[dates[0]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Selection de plusieurs axes par label"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" -1.184356 | \n",
" -0.723850 | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" -0.582905 | \n",
" 0.818028 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"2013-01-01 -1.184356 -0.723850\n",
"2013-01-02 0.027502 0.242028\n",
"2013-01-03 0.121083 1.633698\n",
"2013-01-04 1.247574 0.102763\n",
"2013-01-05 0.605311 1.339097\n",
"2013-01-06 -0.582905 0.818028"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[:,['A','B']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Avec le label slicing, les deux points de terminaisons sont INCLUS"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"2013-01-02 0.027502 0.242028\n",
"2013-01-03 0.121083 1.633698\n",
"2013-01-04 1.247574 0.102763"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc['20130102':'20130104', ['A','B']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Obtenir une valeur scalaire"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"-1.184356071097634"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[dates[0],'A']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Acces plus rapide (méthode équivalente à la précédente)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"-1.184356071097634"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.at[dates[0],'A']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Selection par position"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Integer : "
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A 1.247574\n",
"B 0.102763\n",
"C -0.254731\n",
"D -1.270130\n",
"Name: 2013-01-04 00:00:00, dtype: float64"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[3]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tranches d'entiers, similaire à numpy"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"2013-01-04 1.247574 0.102763\n",
"2013-01-05 0.605311 1.339097"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[3:5,0:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Par liste d'entiers"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" C | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.937188 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" -0.134070 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 0.313692 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A C\n",
"2013-01-02 0.027502 0.937188\n",
"2013-01-03 0.121083 -0.134070\n",
"2013-01-05 0.605311 0.313692"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[[1,2,4],[0,2]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Découpage de ligne explicite"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 0.358945 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" -0.047994 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[1:3,:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Obtenir une valeur explicitement"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.24202817975998675"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[1,1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Acces rapide au scalaire"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.24202817975998675"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iat[1,1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Indexation booléenne"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"En utilisant une valeur sur une colonne : "
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 0.358945 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" -0.047994 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
" -0.254731 | \n",
" -1.270130 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
" 0.313692 | \n",
" 1.190966 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994\n",
"2013-01-04 1.247574 0.102763 -0.254731 -1.270130\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.A > 0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Opérateur where : "
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" NaN | \n",
" NaN | \n",
" 0.391051 | \n",
" 0.749447 | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 0.358945 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
" 0.313692 | \n",
" 1.190966 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" NaN | \n",
" 0.818028 | \n",
" NaN | \n",
" 0.305170 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"2013-01-01 NaN NaN 0.391051 0.749447\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945\n",
"2013-01-03 0.121083 1.633698 NaN NaN\n",
"2013-01-04 1.247574 0.102763 NaN NaN\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966\n",
"2013-01-06 NaN 0.818028 NaN 0.305170"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df > 0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pour filter, on utilise la méthode isin()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" A B C D E\n",
"2013-01-01 -1.184356 -0.723850 0.391051 0.749447 one\n",
"2013-01-02 0.027502 0.242028 0.937188 0.358945 one\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994 two\n",
"2013-01-04 1.247574 0.102763 -0.254731 -1.270130 three\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966 four\n",
"2013-01-06 -0.582905 0.818028 -0.259498 0.305170 three\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" E | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" -0.047994 | \n",
" two | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
" 0.313692 | \n",
" 1.190966 | \n",
" four | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D E\n",
"2013-01-03 0.121083 1.633698 -0.134070 -0.047994 two\n",
"2013-01-05 0.605311 1.339097 0.313692 1.190966 four"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = df.copy()\n",
"df2['E'] = ['one', 'one','two','three','four','three']\n",
"print(df2)\n",
"\n",
"df2[df2['E'].isin(['two','four'])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Ajouter / modifier valeurs / colonnes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ajouter une nouvelle colonne automatiquement aligne les données par index."
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2013-01-02 1\n",
"2013-01-03 2\n",
"2013-01-04 3\n",
"2013-01-05 4\n",
"2013-01-06 5\n",
"2013-01-07 6\n",
"Freq: D, dtype: int64\n"
]
}
],
"source": [
"s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))\n",
"print(s1)\n",
"df['F'] = s1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Modifier une valeur par label"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.at[dates[0],'A'] = 0"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.0"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[dates[0],'A']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Modifier une valeur par position"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.iat[0,1] = 0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Modifier une valeur en assignant un tableau numpy"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.loc[:,'D'] = np.array([5] * len(df))"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.391051 | \n",
" 5 | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 5 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" 5 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
" -0.254731 | \n",
" 5 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 0.605311 | \n",
" 1.339097 | \n",
" 0.313692 | \n",
" 5 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" -0.582905 | \n",
" 0.818028 | \n",
" -0.259498 | \n",
" 5 | \n",
" 5.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F\n",
"2013-01-01 0.000000 0.000000 0.391051 5 NaN\n",
"2013-01-02 0.027502 0.242028 0.937188 5 1.0\n",
"2013-01-03 0.121083 1.633698 -0.134070 5 2.0\n",
"2013-01-04 1.247574 0.102763 -0.254731 5 3.0\n",
"2013-01-05 0.605311 1.339097 0.313692 5 4.0\n",
"2013-01-06 -0.582905 0.818028 -0.259498 5 5.0"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Gérer les données manquantes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pandas utilise le type np.nan pour représenter les valeurs manquantes. Ce n'est pas codé pour faire des calculs."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reindex permet de changer/ajouter/supprimer les index d'un axe. Cette fonction retourne une copie des données"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
" E | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.391051 | \n",
" 5 | \n",
" NaN | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 5 | \n",
" 1.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" 5 | \n",
" 2.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
" -0.254731 | \n",
" 5 | \n",
" 3.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F E\n",
"2013-01-01 0.000000 0.000000 0.391051 5 NaN 1.0\n",
"2013-01-02 0.027502 0.242028 0.937188 5 1.0 1.0\n",
"2013-01-03 0.121083 1.633698 -0.134070 5 2.0 NaN\n",
"2013-01-04 1.247574 0.102763 -0.254731 5 3.0 NaN"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])\n",
"df1.loc[dates[0]:dates[1],'E'] = 1\n",
"df1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pour supprimer les lignes contenant des NaN :"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
" E | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 5 | \n",
" 1.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F E\n",
"2013-01-02 0.027502 0.242028 0.937188 5 1.0 1.0"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.dropna(how='any')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remplacement des valeurs manquantes"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
" E | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.391051 | \n",
" 5 | \n",
" 5.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 0.937188 | \n",
" 5 | \n",
" 1.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.121083 | \n",
" 1.633698 | \n",
" -0.134070 | \n",
" 5 | \n",
" 2.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.247574 | \n",
" 0.102763 | \n",
" -0.254731 | \n",
" 5 | \n",
" 3.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F E\n",
"2013-01-01 0.000000 0.000000 0.391051 5 5.0 1.0\n",
"2013-01-02 0.027502 0.242028 0.937188 5 1.0 1.0\n",
"2013-01-03 0.121083 1.633698 -0.134070 5 2.0 5.0\n",
"2013-01-04 1.247574 0.102763 -0.254731 5 3.0 5.0"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.fillna(value=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Obtenir le masque de booléen de l'emplacement des nan"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
" E | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F E\n",
"2013-01-01 False False False False True False\n",
"2013-01-02 False False False False False False\n",
"2013-01-03 False False False False False True\n",
"2013-01-04 False False False False False True"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.isnull(df1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Opérations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Stats"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Les opérations excluent généralement les données manquantes."
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A 0.236427\n",
"B 0.689269\n",
"C 0.165605\n",
"D 5.000000\n",
"F 3.000000\n",
"dtype: float64\n",
"2013-01-01 1.347763\n",
"2013-01-02 1.441344\n",
"2013-01-03 1.724142\n",
"2013-01-04 1.819121\n",
"2013-01-05 2.251620\n",
"2013-01-06 1.995125\n",
"Freq: D, dtype: float64\n"
]
}
],
"source": [
"print(df.mean())\n",
"print(df.mean(1)) #Autre axe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Situation avec des objets de dimmension différentes. En plus, pandas va automatiquement étendre la donnée sur la dimension spécifiée"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2013-01-01 NaN\n",
"2013-01-02 NaN\n",
"2013-01-03 1.0\n",
"2013-01-04 3.0\n",
"2013-01-05 5.0\n",
"2013-01-06 NaN\n",
"Freq: D, dtype: float64\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" -0.878917 | \n",
" 0.633698 | \n",
" -1.134070 | \n",
" 4.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" -1.752426 | \n",
" -2.897237 | \n",
" -3.254731 | \n",
" 2.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" -4.394689 | \n",
" -3.660903 | \n",
" -4.686308 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F\n",
"2013-01-01 NaN NaN NaN NaN NaN\n",
"2013-01-02 NaN NaN NaN NaN NaN\n",
"2013-01-03 -0.878917 0.633698 -1.134070 4.0 1.0\n",
"2013-01-04 -1.752426 -2.897237 -3.254731 2.0 0.0\n",
"2013-01-05 -4.394689 -3.660903 -4.686308 0.0 -1.0\n",
"2013-01-06 NaN NaN NaN NaN NaN"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)\n",
"\n",
"print(s)\n",
"\n",
"df.sub(s, axis='index')"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on method sub in module pandas.core.ops:\n",
"\n",
"sub(other, axis='columns', level=None, fill_value=None) method of pandas.core.frame.DataFrame instance\n",
" Get Subtraction of dataframe and other, element-wise (binary operator `sub`).\n",
" \n",
" Equivalent to ``dataframe - other``, but with support to substitute a fill_value\n",
" for missing data in one of the inputs. With reverse version, `rsub`.\n",
" \n",
" Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to\n",
" arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`.\n",
" \n",
" Parameters\n",
" ----------\n",
" other : scalar, sequence, Series, or DataFrame\n",
" Any single or multiple element data structure, or list-like object.\n",
" axis : {0 or 'index', 1 or 'columns'}\n",
" Whether to compare by the index (0 or 'index') or columns\n",
" (1 or 'columns'). For Series input, axis to match Series index on.\n",
" level : int or label\n",
" Broadcast across a level, matching Index values on the\n",
" passed MultiIndex level.\n",
" fill_value : float or None, default None\n",
" Fill existing missing (NaN) values, and any new element needed for\n",
" successful DataFrame alignment, with this value before computation.\n",
" If data in both corresponding DataFrame locations is missing\n",
" the result will be missing.\n",
" \n",
" Returns\n",
" -------\n",
" DataFrame\n",
" Result of the arithmetic operation.\n",
" \n",
" See Also\n",
" --------\n",
" DataFrame.add : Add DataFrames.\n",
" DataFrame.sub : Subtract DataFrames.\n",
" DataFrame.mul : Multiply DataFrames.\n",
" DataFrame.div : Divide DataFrames (float division).\n",
" DataFrame.truediv : Divide DataFrames (float division).\n",
" DataFrame.floordiv : Divide DataFrames (integer division).\n",
" DataFrame.mod : Calculate modulo (remainder after division).\n",
" DataFrame.pow : Calculate exponential power.\n",
" \n",
" Notes\n",
" -----\n",
" Mismatched indices will be unioned together.\n",
" \n",
" Examples\n",
" --------\n",
" >>> df = pd.DataFrame({'angles': [0, 3, 4],\n",
" ... 'degrees': [360, 180, 360]},\n",
" ... index=['circle', 'triangle', 'rectangle'])\n",
" >>> df\n",
" angles degrees\n",
" circle 0 360\n",
" triangle 3 180\n",
" rectangle 4 360\n",
" \n",
" Add a scalar with operator version which return the same\n",
" results.\n",
" \n",
" >>> df + 1\n",
" angles degrees\n",
" circle 1 361\n",
" triangle 4 181\n",
" rectangle 5 361\n",
" \n",
" >>> df.add(1)\n",
" angles degrees\n",
" circle 1 361\n",
" triangle 4 181\n",
" rectangle 5 361\n",
" \n",
" Divide by constant with reverse version.\n",
" \n",
" >>> df.div(10)\n",
" angles degrees\n",
" circle 0.0 36.0\n",
" triangle 0.3 18.0\n",
" rectangle 0.4 36.0\n",
" \n",
" >>> df.rdiv(10)\n",
" angles degrees\n",
" circle inf 0.027778\n",
" triangle 3.333333 0.055556\n",
" rectangle 2.500000 0.027778\n",
" \n",
" Subtract a list and Series by axis with operator version.\n",
" \n",
" >>> df - [1, 2]\n",
" angles degrees\n",
" circle -1 358\n",
" triangle 2 178\n",
" rectangle 3 358\n",
" \n",
" >>> df.sub([1, 2], axis='columns')\n",
" angles degrees\n",
" circle -1 358\n",
" triangle 2 178\n",
" rectangle 3 358\n",
" \n",
" >>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']),\n",
" ... axis='index')\n",
" angles degrees\n",
" circle -1 359\n",
" triangle 2 179\n",
" rectangle 3 359\n",
" \n",
" Multiply a DataFrame of different shape with operator version.\n",
" \n",
" >>> other = pd.DataFrame({'angles': [0, 3, 4]},\n",
" ... index=['circle', 'triangle', 'rectangle'])\n",
" >>> other\n",
" angles\n",
" circle 0\n",
" triangle 3\n",
" rectangle 4\n",
" \n",
" >>> df * other\n",
" angles degrees\n",
" circle 0 NaN\n",
" triangle 9 NaN\n",
" rectangle 16 NaN\n",
" \n",
" >>> df.mul(other, fill_value=0)\n",
" angles degrees\n",
" circle 0 0.0\n",
" triangle 9 0.0\n",
" rectangle 16 0.0\n",
" \n",
" Divide by a MultiIndex by level.\n",
" \n",
" >>> df_multindex = pd.DataFrame({'angles': [0, 3, 4, 4, 5, 6],\n",
" ... 'degrees': [360, 180, 360, 360, 540, 720]},\n",
" ... index=[['A', 'A', 'A', 'B', 'B', 'B'],\n",
" ... ['circle', 'triangle', 'rectangle',\n",
" ... 'square', 'pentagon', 'hexagon']])\n",
" >>> df_multindex\n",
" angles degrees\n",
" A circle 0 360\n",
" triangle 3 180\n",
" rectangle 4 360\n",
" B square 4 360\n",
" pentagon 5 540\n",
" hexagon 6 720\n",
" \n",
" >>> df.div(df_multindex, level=1, fill_value=0)\n",
" angles degrees\n",
" A circle NaN 1.0\n",
" triangle 1.0 1.0\n",
" rectangle 1.0 1.0\n",
" B square 0.0 0.0\n",
" pentagon 0.0 0.0\n",
" hexagon 0.0 0.0\n",
"\n"
]
}
],
"source": [
"help(df.sub)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" -0.878917 | \n",
" 0.633698 | \n",
" -1.134070 | \n",
" 4.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" -1.752426 | \n",
" -2.897237 | \n",
" -3.254731 | \n",
" 2.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" -4.394689 | \n",
" -3.660903 | \n",
" -4.686308 | \n",
" 0.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F\n",
"2013-01-01 NaN NaN NaN NaN NaN\n",
"2013-01-02 NaN NaN NaN NaN NaN\n",
"2013-01-03 -0.878917 0.633698 -1.134070 4.0 1.0\n",
"2013-01-04 -1.752426 -2.897237 -3.254731 2.0 0.0\n",
"2013-01-05 -4.394689 -3.660903 -4.686308 0.0 -1.0\n",
"2013-01-06 NaN NaN NaN NaN NaN"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sub(s, axis='index')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Apply"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Appliquer des foncitons aux données"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" F | \n",
"
\n",
" \n",
" \n",
" \n",
" 2013-01-01 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.391051 | \n",
" 5 | \n",
" NaN | \n",
"
\n",
" \n",
" 2013-01-02 | \n",
" 0.027502 | \n",
" 0.242028 | \n",
" 1.328239 | \n",
" 10 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2013-01-03 | \n",
" 0.148585 | \n",
" 1.875726 | \n",
" 1.194169 | \n",
" 15 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 2013-01-04 | \n",
" 1.396159 | \n",
" 1.978489 | \n",
" 0.939438 | \n",
" 20 | \n",
" 6.0 | \n",
"
\n",
" \n",
" 2013-01-05 | \n",
" 2.001470 | \n",
" 3.317585 | \n",
" 1.253129 | \n",
" 25 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 2013-01-06 | \n",
" 1.418565 | \n",
" 4.135613 | \n",
" 0.993632 | \n",
" 30 | \n",
" 15.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D F\n",
"2013-01-01 0.000000 0.000000 0.391051 5 NaN\n",
"2013-01-02 0.027502 0.242028 1.328239 10 1.0\n",
"2013-01-03 0.148585 1.875726 1.194169 15 3.0\n",
"2013-01-04 1.396159 1.978489 0.939438 20 6.0\n",
"2013-01-05 2.001470 3.317585 1.253129 25 10.0\n",
"2013-01-06 1.418565 4.135613 0.993632 30 15.0"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.apply(np.cumsum)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"A 1.830479\n",
"B 1.633698\n",
"C 1.196686\n",
"D 0.000000\n",
"F 4.000000\n",
"dtype: float64"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.apply((lambda x: x.max() - x.min()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Histogramme"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 3\n",
"1 2\n",
"2 3\n",
"3 3\n",
"4 5\n",
"5 3\n",
"6 4\n",
"7 0\n",
"8 1\n",
"9 2\n",
"dtype: int64\n",
"3 4\n",
"2 2\n",
"0 1\n",
"1 1\n",
"4 1\n",
"5 1\n",
"dtype: int64\n"
]
}
],
"source": [
"s = pd.Series(np.random.randint(0, 7, size=10))\n",
"print(s)\n",
"print(s.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Methodes String"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Les séries sont équipées de méthodes pour traiter les strings avec l'attribut str qui rend facile la manipulation de chaque élémen d'un tableau. On utilise régulièrement des expressions régulières."
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 a\n",
"1 b\n",
"2 c\n",
"3 aaba\n",
"4 baca\n",
"5 NaN\n",
"6 caba\n",
"7 dog\n",
"8 cat\n",
"dtype: object"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])\n",
"s.str.lower()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regrouper"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Concaténation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pandas fournit des methodes pour facilement combiner des Series, DatFrame et des Panel objets avec des types variés de set logique pour les indexes et des fonctionnalités d'algèbre dans le cas de jointure / regroupement"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut concaténer des objets pandas avec concat()"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0 1 2 3\n",
"0 0.069013 -0.040383 -0.607478 -1.286906\n",
"1 1.270398 -0.523822 0.858627 0.751044\n",
"2 1.480067 -0.005535 0.244562 -0.224817\n",
"3 -0.730252 0.188720 0.105386 -0.600060\n",
"4 -0.844170 1.736848 0.190962 0.451052\n",
"5 0.181550 0.136675 0.892205 0.701368\n",
"6 -0.485632 -0.894081 0.532846 -1.228718\n",
"7 0.324976 -1.670890 -0.082170 -0.383374\n",
"8 0.589956 -0.859584 0.737789 1.193208\n",
"9 1.124989 -1.301511 2.010070 0.593592\n"
]
}
],
"source": [
"df = pd.DataFrame(np.random.randn(10, 4))\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pieces = [df[:3], df[3:7], df[7:]]"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.069013 | \n",
" -0.040383 | \n",
" -0.607478 | \n",
" -1.286906 | \n",
"
\n",
" \n",
" 1 | \n",
" 1.270398 | \n",
" -0.523822 | \n",
" 0.858627 | \n",
" 0.751044 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.480067 | \n",
" -0.005535 | \n",
" 0.244562 | \n",
" -0.224817 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.730252 | \n",
" 0.188720 | \n",
" 0.105386 | \n",
" -0.600060 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.844170 | \n",
" 1.736848 | \n",
" 0.190962 | \n",
" 0.451052 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.181550 | \n",
" 0.136675 | \n",
" 0.892205 | \n",
" 0.701368 | \n",
"
\n",
" \n",
" 6 | \n",
" -0.485632 | \n",
" -0.894081 | \n",
" 0.532846 | \n",
" -1.228718 | \n",
"
\n",
" \n",
" 7 | \n",
" 0.324976 | \n",
" -1.670890 | \n",
" -0.082170 | \n",
" -0.383374 | \n",
"
\n",
" \n",
" 8 | \n",
" 0.589956 | \n",
" -0.859584 | \n",
" 0.737789 | \n",
" 1.193208 | \n",
"
\n",
" \n",
" 9 | \n",
" 1.124989 | \n",
" -1.301511 | \n",
" 2.010070 | \n",
" 0.593592 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"0 0.069013 -0.040383 -0.607478 -1.286906\n",
"1 1.270398 -0.523822 0.858627 0.751044\n",
"2 1.480067 -0.005535 0.244562 -0.224817\n",
"3 -0.730252 0.188720 0.105386 -0.600060\n",
"4 -0.844170 1.736848 0.190962 0.451052\n",
"5 0.181550 0.136675 0.892205 0.701368\n",
"6 -0.485632 -0.894081 0.532846 -1.228718\n",
"7 0.324976 -1.670890 -0.082170 -0.383374\n",
"8 0.589956 -0.859584 0.737789 1.193208\n",
"9 1.124989 -1.301511 2.010070 0.593592"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat(pieces)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Jointures"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On peut merger à la manière de requete SQL."
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" key lval\n",
"0 foo 1\n",
"1 foo 2\n",
" key rval\n",
"0 foo 4\n",
"1 foo 5\n",
" key lval rval\n",
"0 foo 1 4\n",
"1 foo 1 5\n",
"2 foo 2 4\n",
"3 foo 2 5\n"
]
}
],
"source": [
"left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})\n",
"right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})\n",
"\n",
"print(left)\n",
"print(right)\n",
"\n",
"print(pd.merge(left, right, on='key'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Append"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -1.080456 | \n",
" 0.981786 | \n",
" 0.880026 | \n",
" 1.144814 | \n",
"
\n",
" \n",
" 1 | \n",
" -1.665425 | \n",
" -0.577014 | \n",
" 1.265051 | \n",
" -0.515127 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.342626 | \n",
" -0.635112 | \n",
" 1.558737 | \n",
" 1.070910 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.236688 | \n",
" -1.178135 | \n",
" 0.371970 | \n",
" 0.268590 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.478343 | \n",
" 0.015331 | \n",
" 0.835794 | \n",
" 0.553637 | \n",
"
\n",
" \n",
" 5 | \n",
" -1.115995 | \n",
" -0.086228 | \n",
" -0.959499 | \n",
" -1.168225 | \n",
"
\n",
" \n",
" 6 | \n",
" 0.615278 | \n",
" -0.550461 | \n",
" -0.328022 | \n",
" 0.753208 | \n",
"
\n",
" \n",
" 7 | \n",
" -0.308374 | \n",
" 1.024291 | \n",
" -1.437537 | \n",
" 1.189244 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"0 -1.080456 0.981786 0.880026 1.144814\n",
"1 -1.665425 -0.577014 1.265051 -0.515127\n",
"2 0.342626 -0.635112 1.558737 1.070910\n",
"3 -0.236688 -1.178135 0.371970 0.268590\n",
"4 -0.478343 0.015331 0.835794 0.553637\n",
"5 -1.115995 -0.086228 -0.959499 -1.168225\n",
"6 0.615278 -0.550461 -0.328022 0.753208\n",
"7 -0.308374 1.024291 -1.437537 1.189244"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -1.080456 | \n",
" 0.981786 | \n",
" 0.880026 | \n",
" 1.144814 | \n",
"
\n",
" \n",
" 1 | \n",
" -1.665425 | \n",
" -0.577014 | \n",
" 1.265051 | \n",
" -0.515127 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.342626 | \n",
" -0.635112 | \n",
" 1.558737 | \n",
" 1.070910 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.236688 | \n",
" -1.178135 | \n",
" 0.371970 | \n",
" 0.268590 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.478343 | \n",
" 0.015331 | \n",
" 0.835794 | \n",
" 0.553637 | \n",
"
\n",
" \n",
" 5 | \n",
" -1.115995 | \n",
" -0.086228 | \n",
" -0.959499 | \n",
" -1.168225 | \n",
"
\n",
" \n",
" 6 | \n",
" 0.615278 | \n",
" -0.550461 | \n",
" -0.328022 | \n",
" 0.753208 | \n",
"
\n",
" \n",
" 7 | \n",
" -0.308374 | \n",
" 1.024291 | \n",
" -1.437537 | \n",
" 1.189244 | \n",
"
\n",
" \n",
" 8 | \n",
" -0.236688 | \n",
" -1.178135 | \n",
" 0.371970 | \n",
" 0.268590 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"0 -1.080456 0.981786 0.880026 1.144814\n",
"1 -1.665425 -0.577014 1.265051 -0.515127\n",
"2 0.342626 -0.635112 1.558737 1.070910\n",
"3 -0.236688 -1.178135 0.371970 0.268590\n",
"4 -0.478343 0.015331 0.835794 0.553637\n",
"5 -1.115995 -0.086228 -0.959499 -1.168225\n",
"6 0.615278 -0.550461 -0.328022 0.753208\n",
"7 -0.308374 1.024291 -1.437537 1.189244\n",
"8 -0.236688 -1.178135 0.371970 0.268590"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = df.iloc[3]\n",
"df.append(s, ignore_index=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Groupement"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Le regroupement comprend les étapes suivantes :\n",
" * Séparation de la donnée en groupes\n",
" * Appliquer une fonction a chaque group indépendamment\n",
" * Combiner les resultats dans une structure de données"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',\n",
"'foo', 'bar', 'foo', 'foo'],\n",
"'B' : ['one', 'one', 'two', 'three',\n",
"'two', 'two', 'one', 'three'],\n",
"'C' : np.random.randn(8),\n",
"'D' : np.random.randn(8)})"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" foo | \n",
" one | \n",
" 0.438443 | \n",
" 0.479237 | \n",
"
\n",
" \n",
" 1 | \n",
" bar | \n",
" one | \n",
" -0.457745 | \n",
" -0.363322 | \n",
"
\n",
" \n",
" 2 | \n",
" foo | \n",
" two | \n",
" 0.684435 | \n",
" -0.421288 | \n",
"
\n",
" \n",
" 3 | \n",
" bar | \n",
" three | \n",
" -0.620241 | \n",
" 0.267481 | \n",
"
\n",
" \n",
" 4 | \n",
" foo | \n",
" two | \n",
" -0.032591 | \n",
" 2.801863 | \n",
"
\n",
" \n",
" 5 | \n",
" bar | \n",
" two | \n",
" 0.878103 | \n",
" -0.064529 | \n",
"
\n",
" \n",
" 6 | \n",
" foo | \n",
" one | \n",
" -0.000896 | \n",
" -1.695125 | \n",
"
\n",
" \n",
" 7 | \n",
" foo | \n",
" three | \n",
" 1.341618 | \n",
" -0.165415 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D\n",
"0 foo one 0.438443 0.479237\n",
"1 bar one -0.457745 -0.363322\n",
"2 foo two 0.684435 -0.421288\n",
"3 bar three -0.620241 0.267481\n",
"4 foo two -0.032591 2.801863\n",
"5 bar two 0.878103 -0.064529\n",
"6 foo one -0.000896 -1.695125\n",
"7 foo three 1.341618 -0.165415"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Groupement et somme des groupes"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" A | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" bar | \n",
" -0.199883 | \n",
" -0.160370 | \n",
"
\n",
" \n",
" foo | \n",
" 2.431011 | \n",
" 0.999271 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" C D\n",
"A \n",
"bar -0.199883 -0.160370\n",
"foo 2.431011 0.999271"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('A').sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Groupement de multiple colonnes"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" C | \n",
" D | \n",
"
\n",
" \n",
" A | \n",
" B | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" bar | \n",
" one | \n",
" -0.457745 | \n",
" -0.363322 | \n",
"
\n",
" \n",
" three | \n",
" -0.620241 | \n",
" 0.267481 | \n",
"
\n",
" \n",
" two | \n",
" 0.878103 | \n",
" -0.064529 | \n",
"
\n",
" \n",
" foo | \n",
" one | \n",
" 0.437547 | \n",
" -1.215888 | \n",
"
\n",
" \n",
" three | \n",
" 1.341618 | \n",
" -0.165415 | \n",
"
\n",
" \n",
" two | \n",
" 0.651845 | \n",
" 2.380574 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" C D\n",
"A B \n",
"bar one -0.457745 -0.363322\n",
" three -0.620241 0.267481\n",
" two 0.878103 -0.064529\n",
"foo one 0.437547 -1.215888\n",
" three 1.341618 -0.165415\n",
" two 0.651845 2.380574"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(['A','B']).sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Reformation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Stack"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" first | \n",
" second | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" bar | \n",
" one | \n",
" -0.347244 | \n",
" -0.780456 | \n",
"
\n",
" \n",
" two | \n",
" -0.394013 | \n",
" -0.561465 | \n",
"
\n",
" \n",
" baz | \n",
" one | \n",
" 1.055322 | \n",
" 2.620028 | \n",
"
\n",
" \n",
" two | \n",
" -1.443417 | \n",
" 0.514155 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"first second \n",
"bar one -0.347244 -0.780456\n",
" two -0.394013 -0.561465\n",
"baz one 1.055322 2.620028\n",
" two -1.443417 0.514155"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',\n",
"'foo', 'foo', 'qux', 'qux'],\n",
"['one', 'two', 'one', 'two',\n",
"'one', 'two', 'one', 'two']]))\n",
"\n",
"index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])\n",
"df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])\n",
"df2 = df[:4]\n",
"df2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"La méthode stack() compresses un level dans les colonnes du dataframe\n"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"first second \n",
"bar one A -0.347244\n",
" B -0.780456\n",
" two A -0.394013\n",
" B -0.561465\n",
"baz one A 1.055322\n",
" B 2.620028\n",
" two A -1.443417\n",
" B 0.514155\n",
"dtype: float64"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stacked = df2.stack()\n",
"stacked"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Avec une 'stacked' dataframe ou série, l'opération inverse est unstack()"
]
}
],
"metadata": {
"anaconda-cloud": {},
"interpreter": {
"hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f"
},
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}