Skip to content

Write Idiomatic Pandas Code

Foreword

Code snippets and excerpts from the tutorial. Python 3. From DataCamp.


Load and Explore the Data

The Data

The datasets are downloadable from Kaggle.

%pylab inline
import pandas as pd

# Import Times Higher Education World University Rankings data
# https://www.timeshighereducation.com/world-university-rankings
times_df = pd.read_csv('timesData.csv', thousands=",")

# Import Academic Ranking of World Universities data
# http://www.shanghairanking.com/
shanghai_df = pd.read_csv('shanghaiData.csv')
1
Populating the interactive namespace from numpy and matplotlib

Quickly Inspecting the Data

# Return the first rows of `times_df`
times_df.head()
world_rank university_name country teaching international research citations income total_score num_students student_staff_ratio international_students female_male_ratio year
0 1 Harvard University United States of America 99.7 72.4 98.7 98.8 34.5 96.1 20152.0 8.9 25% NaN 2011
1 2 California Institute of Technology United States of America 97.7 54.6 98.0 99.9 83.7 96.0 2243.0 6.9 27% 33 : 67 2011
2 3 Massachusetts Institute of Technology United States of America 97.8 82.3 91.4 99.9 87.5 95.6 11074.0 9.0 33% 37 : 63 2011
3 4 Stanford University United States of America 98.3 29.5 98.1 99.2 64.3 94.3 15596.0 7.8 22% 42 : 58 2011
4 5 Princeton University United States of America 90.9 70.3 95.4 99.9 - 94.2 7929.0 8.4 27% 45 : 55 2011
# Describe `times_df`
times_df.describe()
teaching research citations num_students student_staff_ratio year
count 2603.000000 2603.000000 2603.000000 2544.000000 2544.000000 2603.000000
mean 37.801498 35.910257 60.921629 23873.758648 18.445283 2014.075682
std 17.604218 21.254805 23.073219 17675.946877 11.458698 1.685733
min 9.900000 2.900000 1.200000 462.000000 0.600000 2011.000000
25% 24.700000 19.600000 45.500000 12637.750000 11.975000 2013.000000
50% 33.900000 30.500000 62.500000 20851.000000 16.100000 2014.000000
75% 46.400000 47.250000 79.050000 29991.000000 21.500000 2016.000000
max 99.700000 99.400000 100.000000 379231.000000 162.600000 2016.000000
# Return the first rows of `shanghai_df`
shanghai_df.head()
world_rank university_name national_rank total_score alumni award hici ns pub pcp year
0 1 Harvard University 1 100.0 100.0 100.0 100.0 100.0 100.0 72.4 2005
1 2 University of Cambridge 1 73.6 99.8 93.4 53.3 56.6 70.9 66.9 2005
2 3 Stanford University 2 73.4 41.1 72.2 88.5 70.9 72.3 65.0 2005
3 4 University of California, Berkeley 3 72.8 71.8 76.0 69.4 73.9 72.2 52.7 2005
4 5 Massachusetts Institute of Technology (MIT) 4 70.1 74.0 80.6 66.7 65.8 64.3 53.0 2005
# Describe `shanghai_df`
shanghai_df.describe()
total_score alumni award hici ns pub pcp year
count 1101.000000 4896.000000 4895.00000 4895.000000 4875.000000 4895.000000 4895.000000 4897.000000
mean 36.383470 9.161724 7.69191 16.221491 16.078503 38.254648 21.242329 2009.658566
std 13.557186 14.140636 15.49411 14.382710 12.511529 13.050809 9.254351 3.197576
min 23.500000 0.000000 0.00000 0.000000 0.000000 7.300000 8.300000 2005.000000
25% 27.400000 0.000000 0.00000 7.300000 8.000000 28.900000 15.600000 2007.000000
50% 31.300000 0.000000 0.00000 12.600000 12.800000 36.000000 19.000000 2009.000000
75% 41.800000 15.600000 13.40000 21.700000 19.800000 45.300000 24.500000 2012.000000
max 100.000000 100.000000 100.00000 100.000000 100.000000 100.000000 100.000000 2015.000000

Index and Pull Values

[] for row numbers or column names, loc[], iloc[], query.

# Retrieve the total score of the first row
print(times_df.loc[0, 'total_score'])
1
96.1
# Retrieve rows 0 and 1
print(times_df[0:2])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
  world_rank                     university_name                   country  \
0          1                  Harvard University  United States of America   
1          2  California Institute of Technology  United States of America

   teaching international  research  citations income total_score  \
0      99.7          72.4      98.7       98.8   34.5        96.1   
1      97.7          54.6      98.0       99.9   83.7        96.0

   num_students  student_staff_ratio international_students female_male_ratio  \
0       20152.0                  8.9                    25%               NaN   
1        2243.0                  6.9                    27%           33 : 67

   year  
0  2011  
1  2011
# Retrieve the values at columns and rows 1-3
print(times_df.iloc[1:4,1:4])
1
2
3
4
                         university_name                   country  teaching
1     California Institute of Technology  United States of America      97.7
2  Massachusetts Institute of Technology  United States of America      97.8
3                    Stanford University  United States of America      98.3
# Retrieve the column `total_score` 
print(times_df['total_score'])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
0       96.1
1       96.0
2       95.6
3       94.3
4       94.2
5       91.2
6       91.2
7       91.1
8       90.6
9       89.5
10      87.7
11      86.9
12      86.4
13      83.9
14      83.4
15      83.4
16      82.0
17      81.0
18      79.5
19      79.3
20      79.2
21      78.4
22      78.0
23      76.5
24      75.9
25      75.6
26      75.3
27      75.1
28      75.0
29      73.8
        ... 
2573       -
2574       -
2575       -
2576       -
2577       -
2578       -
2579       -
2580       -
2581       -
2582       -
2583       -
2584       -
2585       -
2586       -
2587       -
2588       -
2589       -
2590       -
2591       -
2592       -
2593       -
2594       -
2595       -
2596       -
2597       -
2598       -
2599       -
2600       -
2601       -
2602       -
Name: total_score, Length: 2603, dtype: object
# Are the last entries after 2006?
print(shanghai_df.loc[:-10, 'year'] > 2006)
1
Series([], Name: year, dtype: bool)
# Was the alumni count higher than 90 for the first ten universities?
print(shanghai_df.loc[0:11, 'alumni'] > 90)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
0      True
1      True
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
Name: alumni, dtype: bool
# Query `shanghai_df` for universities with total score between 40 and 50
average_schools = shanghai_df.query('total_score > 0 and total_score < 50')

# Print the result
print(average_schools)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
     world_rank                                    university_name  \
15           16                  University of Wisconsin - Madison   
16           17                           University of Washington   
17           18            University of California, San Francisco   
18           19                       The Johns Hopkins University   
19           20                            The University of Tokyo   
20           21                 University of Michigan - Ann Arbor   
21           22                                   Kyoto University   
22           23  The Imperial College of Science, Technology an...   
23           24                              University of Toronto   
24           25         University of Illinois at Urbana-Champaign   
25           26                          University College London   
26           27       Swiss Federal Institute of Technology Zurich   
27           28                 Washington University in St. Louis   
28           29                                New York University   
29           30                             Rockefeller University   
30           31                            Northwestern University   
31           32                                    Duke University   
32           32               University of Minnesota, Twin Cities   
33           34            University of California, Santa Barbara   
34           35                  University of Colorado at Boulder   
35           36                  The University of Texas at Austin   
36           37                     University of British Columbia   
37           38  The University of Texas Southwestern Medical C...   
38           39    Pennsylvania State University - University Park   
39           39                              Vanderbilt University   
40           41                    University of California, Davis   
41           41                                 Utrecht University   
42           43  Rutgers, The State University of New Jersey - ...   
43           43                           University of Pittsburgh   
44           45                               Karolinska Institute   
...         ...                                                ...   
4467         71                                   Ghent University   
4468         72                   Ecole Normale Superieure - Paris   
4469         73                                  Aarhus University   
4470         73                                  Boston University   
4471         75                                   Brown University   
4472         75                            University of Groningen   
4473         77                                  Nagoya University   
4474         77                               Stockholm University   
4475         77            Technion-Israel Institute of Technology   
4476         77                 The Australian National University   
4477         77                       The University of Queensland   
4478         82                                  Leiden University   
4479         83                              University of Florida   
4480         84                                    Rice University   
4481         85                                   Osaka University   
4482         86                            Moscow State University   
4483         87                The University of Western Australia   
4484         87                                University of Basel   
4485         87                           University of Strasbourg   
4486         90                                          KU Leuven   
4487         90                              University of Arizona   
4488         92                              University of Warwick   
4489         93                           Arizona State University   
4490         93               University of California, Santa Cruz   
4491         93                                 University of Utah   
4492         96                                McMaster University   
4493         97                                 University of Bonn   
4494         98                            VU University Amsterdam   
4495         99                          Michigan State University   
4496        100                               Texas A&M University

     national_rank  total_score  alumni  award  hici    ns   pub   pcp  year  
15              14         49.2    43.0   36.3  52.1  46.3  68.7  29.0  2005  
16              15         48.4    28.8   32.4  53.9  47.1  73.8  27.2  2005  
17              16         47.8     0.0   37.6  55.6  57.9  58.8  45.2  2005  
18              17         46.9    51.4   28.3  41.6  52.2  67.7  24.9  2005  
19               1         46.7    36.0   14.4  38.5  52.1  86.5  34.7  2005  
20              18         44.9    43.0    0.0  61.9  43.0  76.5  30.9  2005  
21               2         43.8    39.7   34.1  34.2  37.0  72.3  31.1  2005  
22               3         43.7    20.8   38.1  40.8  38.2  64.6  40.3  2005  
23               1         43.1    28.1   19.7  39.3  38.9  76.7  41.9  2005  
24              19         42.8    41.6   37.4  44.4  34.1  58.0  26.0  2005  
25               4         42.6    30.7   32.9  37.7  41.5  60.5  38.8  2005  
26               1         41.7    40.2   37.0  35.1  41.1  43.4  52.4  2005  
27              20         40.7    25.1   26.6  38.5  46.5  53.9  39.9  2005  
28              21         38.8    33.8   25.0  43.0  35.3  55.4  26.3  2005  
29              22         38.2    22.6   59.8  28.3  44.1  24.0  35.9  2005  
30              23         37.9    21.7   19.3  44.4  33.8  57.6  36.2  2005  
31              24         37.7    20.8    0.0  47.1  45.3  60.8  38.9  2005  
32              24         37.7    36.0    0.0  49.7  35.2  68.4  23.8  2005  
33              26         36.9     0.0   36.0  42.3  39.0  44.1  35.8  2005  
34              27         36.3    16.6   29.8  40.8  36.6  46.3  29.5  2005  
35              28         35.5    21.7   17.1  49.1  30.0  54.8  21.7  2005  
36               2         35.4    20.8   19.3  32.4  32.5  60.4  33.9  2005  
37              29         34.8    24.3   33.9  31.4  38.2  37.9  31.0  2005  
38              30         33.4    14.0    0.0  45.8  37.9  59.9  24.0  2005  
39              30         33.4    12.5   30.2  34.2  24.5  49.2  35.6  2005  
40              32         32.9     0.0    0.0  46.5  34.5  64.0  29.8  2005  
41               1         32.9    30.7   21.4  27.2  27.3  55.7  25.9  2005  
42              33         32.3    15.4   20.4  36.9  32.9  47.1  24.1  2005  
43              33         32.3    25.1    0.0  40.1  25.9  64.3  28.2  2005  
44               1         32.1    30.7   27.8  33.3  19.7  47.3  25.1  2005  
...            ...          ...     ...    ...   ...   ...   ...   ...   ...  
4467             1         27.8     5.1   13.3  26.9  18.3  57.5  34.5  2015  
4468             3         27.6    48.9   28.0   6.2  19.5  26.4  60.5  2015  
4469             2         27.3    11.5   22.1  12.3  25.8  51.8  31.0  2015  
4470            42         27.3    11.5   11.5  29.1  26.3  49.5  21.8  2015  
4471            43         27.0    14.5   13.3  26.2  25.1  43.6  32.8  2015  
4472             2         27.0     0.0   18.8  23.0  21.0  52.6  32.7  2015  
4473             3         26.7    29.0   25.3  16.8  17.7  44.3  23.1  2015  
4474             3         26.7    24.1   27.4  18.1  19.4  40.4  25.6  2015  
4475             2         26.7    23.5   37.6  15.2  18.3  33.3  28.2  2015  
4476             2         26.7    13.6   19.2  24.8  20.1  45.1  29.1  2015  
4477             2         26.7    12.6    0.0  22.0  24.0  63.2  29.3  2015  
4478             3         26.5    17.8    9.4  25.7  22.0  46.9  32.9  2015  
4479            44         26.2    17.0    0.0  30.9  21.0  58.7  17.3  2015  
4480            45         26.0    16.2   21.7  28.1  22.8  29.2  34.3  2015  
4481             4         25.7     8.9    0.0  31.6  26.7  51.7  21.7  2015  
4482             1         25.3    41.4   33.0   0.0   7.7  46.4  31.3  2015  
4483             4         24.9    13.6   14.1  24.1  14.5  47.5  28.9  2015  
4484             4         24.9    19.2   16.3  17.4  21.4  39.2  35.0  2015  
4485             4         24.9    25.1   28.8  15.9  19.0  34.6  21.7  2015  
4486             2         24.7     0.0    0.0  30.1  18.6  56.6  30.4  2015  
4487            46         24.7    14.5    0.0  27.2  27.5  48.4  20.3  2015  
4488             9         24.6     0.0   29.8  23.7  14.0  39.3  26.8  2015  
4489            47         24.5     0.0   20.0  22.2  25.5  42.6  19.1  2015  
4490            47         24.5     0.0    0.0  37.9  33.9  29.0  37.6  2015  
4491            47         24.5     0.0   11.5  26.5  25.5  46.7  18.7  2015  
4492             4         24.4    12.6   18.8  23.2  15.1  44.5  22.5  2015  
4493             4         24.3    15.4   19.8  17.4  21.1  39.8  25.9  2015  
4494             4         24.2     0.0    0.0  27.8  18.0  55.5  33.3  2015  
4495            50         24.0     8.9    0.0  30.7  21.8  50.6  18.9  2015  
4496            51         23.9     0.0    0.0  34.3  22.7  49.5  20.9  2015

[948 rows x 11 columns]
shanghai_df.query("national_rank == 1 and world_rank == 1")
world_rank university_name national_rank total_score alumni award hici ns pub pcp year
shanghai_df.query("alumni < 20")
world_rank university_name national_rank total_score alumni award hici ns pub pcp year
12 13 University of California, San Diego 11 51.0 17.7 34.7 59.8 56.5 64.5 46.6 2005
17 18 University of California, San Francisco 16 47.8 0.0 37.6 55.6 57.9 58.8 45.2 2005
33 34 University of California, Santa Barbara 26 36.9 0.0 36.0 42.3 39.0 44.1 35.8 2005
34 35 University of Colorado at Boulder 27 36.3 16.6 29.8 40.8 36.6 46.3 29.5 2005
38 39 Pennsylvania State University - University Park 30 33.4 14.0 0.0 45.8 37.9 59.9 24.0 2005
39 39 Vanderbilt University 30 33.4 12.5 30.2 34.2 24.5 49.2 35.6 2005
40 41 University of California, Davis 32 32.9 0.0 0.0 46.5 34.5 64.0 29.8 2005
42 43 Rutgers, The State University of New Jersey - ... 33 32.3 15.4 20.4 36.9 32.9 47.1 24.1 2005
47 47 University of California, Irvine 35 31.8 0.0 30.0 32.4 28.5 48.2 31.1 2005
49 50 University of Southern California 37 31.7 0.0 27.3 37.7 23.6 52.8 25.8 2005
54 55 University of North Carolina at Chapel Hill 39 30.3 12.5 0.0 35.1 32.8 59.5 27.3 2005
55 56 The Australian National University 1 30.2 17.7 12.9 36.9 29.0 45.1 27.8 2005
57 57 University of Florida 40 30.0 15.4 0.0 35.1 25.0 65.2 25.8 2005
58 57 University of Zurich 2 30.0 12.5 27.3 19.2 30.3 47.2 30.6 2005
61 62 Osaka University 3 29.3 12.5 0.0 23.6 31.1 66.8 29.2 2005
62 63 The Ohio State University - Columbus 41 29.2 17.7 0.0 40.8 21.5 61.2 19.5 2005
63 64 University of Bristol 7 28.8 10.9 18.2 30.4 24.5 47.5 27.4 2005
70 71 University of Heidelberg 3 28.0 10.9 27.7 20.8 20.9 48.1 26.9 2005
72 73 Tohoku University 4 27.8 18.8 0.0 19.2 26.9 65.3 29.0 2005
73 73 University of Arizona 44 27.8 0.0 0.0 29.4 36.8 55.8 25.7 2005
74 75 Purdue University - West Lafayette 45 27.7 18.8 17.1 27.2 21.4 49.8 19.4 2005
75 76 University of Helsinki 1 27.4 18.8 18.2 15.7 21.4 54.5 27.5 2005
76 77 Michigan State University 46 26.9 12.5 0.0 37.7 26.6 51.0 18.7 2005
79 80 Boston University 48 26.1 15.4 0.0 31.4 28.1 50.8 17.5 2005
80 80 King's College London 9 26.1 16.6 23.5 20.8 17.4 44.6 24.8 2005
81 82 University of Melbourne 2 26.0 15.4 14.4 22.2 18.7 53.5 19.9 2005
82 83 University of Nottingham 10 25.9 15.4 20.4 20.8 19.0 45.6 24.8 2005
85 86 Brown University 49 25.4 0.0 13.9 29.4 25.5 40.7 27.9 2005
86 87 Indiana University Bloomington 50 25.2 14.0 18.2 24.8 21.2 42.0 18.2 2005
88 89 Texas A&M University - College Station 51 25.1 0.0 0.0 32.4 24.4 55.0 20.4 2005
... ... ... ... ... ... ... ... ... ... ... ...
4867 401-500 University of Jena 29-39 NaN 0.0 0.0 0.0 9.3 34.0 17.1 2015
4868 401-500 University of Jyvaskyla 4-6 NaN 0.0 0.0 3.6 10.3 26.7 14.1 2015
4869 401-500 University of Konstanz 29-39 NaN 0.0 0.0 12.1 11.4 22.2 13.5 2015
4870 401-500 University of KwaZulu-Natal 3-4 NaN 0.0 0.0 3.6 8.4 32.8 16.6 2015
4871 401-500 University of Ljubljana 1 NaN 0.0 0.0 0.0 7.7 35.1 14.2 2015
4872 401-500 University of Maryland, Baltimore County 126-146 NaN 0.0 0.0 17.4 6.5 17.8 17.3 2015
4873 401-500 University of Milan - Bicocca 11-20 NaN 0.0 0.0 5.0 5.6 30.9 21.4 2015
4874 401-500 University of Nice Sophia Antipolis 19-22 NaN 0.0 0.0 3.6 16.3 26.2 12.7 2015
4875 401-500 University of Oklahoma - Norman 126-146 NaN 0.0 0.0 5.1 10.0 28.0 14.0 2015
4876 401-500 University of Palermo 11-20 NaN 0.0 0.0 6.3 6.6 28.2 14.8 2015
4877 401-500 University of Parma 11-20 NaN 0.0 0.0 13.6 2.1 26.7 19.6 2015
4878 401-500 University of Pavia 11-20 NaN 0.0 0.0 5.0 5.7 30.7 19.7 2015
4879 401-500 University of Perugia 11-20 NaN 0.0 0.0 3.6 7.5 29.3 18.5 2015
4880 401-500 University of Quebec 19-20 NaN 0.0 0.0 0.0 9.8 33.3 16.8 2015
4881 401-500 University of Regensburg 29-39 NaN 0.0 0.0 3.6 13.9 27.7 15.2 2015
4882 401-500 University of Rennes 1 19-22 NaN 0.0 0.0 3.6 9.2 28.1 11.2 2015
4883 401-500 University of Rhode Island 126-146 NaN 0.0 0.0 15.2 6.1 21.1 16.0 2015
4884 401-500 University of Roma - Tor Vergata 11-20 NaN 0.0 0.0 0.0 8.8 33.7 19.2 2015
4885 401-500 University of Rostock 29-39 NaN 0.0 0.0 8.6 8.4 25.0 13.5 2015
4886 401-500 University of Santiago Compostela 9-13 NaN 0.0 0.0 7.1 6.1 31.1 13.2 2015
4887 401-500 University of Science, Malaysia 2 NaN 0.0 0.0 7.1 3.3 30.6 15.7 2015
4888 401-500 University of Seville 9-13 NaN 0.0 0.0 0.0 7.5 33.7 11.3 2015
4889 401-500 University of Surrey 34-37 NaN 0.0 0.0 8.6 4.9 27.0 18.0 2015
4890 401-500 University of Szeged 1-2 NaN 0.0 13.3 3.6 3.4 21.8 12.8 2015
4891 401-500 University of the Basque Country 9-13 NaN 0.0 0.0 3.6 7.1 36.1 13.5 2015
4892 401-500 University of Trieste 11-20 NaN 0.0 0.0 5.0 10.9 25.1 20.1 2015
4893 401-500 University of Zaragoza 9-13 NaN 0.0 0.0 7.6 5.1 33.3 13.1 2015
4894 401-500 Utah State University 126-146 NaN 13.6 0.0 3.6 10.8 25.1 15.5 2015
4895 401-500 Vienna University of Technology 4-6 NaN 0.0 0.0 0.0 12.2 28.8 22.9 2015
4896 401-500 Wake Forest University 126-146 NaN 0.0 0.0 14.9 7.5 25.0 11.9 2015

4122 rows × 11 columns

Method Chaining

pipe() to chain operations and thus eliminate the need for intermediate DataFrames.

Without this operator, instead of writing df.pipe(f).pipe(g).pipe(h) write: h(g(f(df))). This becomes harder to follow once the number of nested functions grows large.

# Extract info
def extract_info(input_df, name):
    df = input_df.copy()
    info_df = pd.DataFrame({'nb_rows': df.shape[0], 'nb_cols': df.shape[1], 'name': name}, index=range(1))
    return info_df
# Gather all info   
all_info = pd.concat([times_df.pipe(extract_info, 'times'), shanghai_df.pipe(extract_info, 'shanghai')])

print(all_info)
1
2
3
       name  nb_cols  nb_rows
0     times       14     2603
0  shanghai       11     4897

Select the common columns.

common_columns = set(shanghai_df.columns) & set(times_df.columns)

# Return `common_columns`
print(common_columns)
1
{'year', 'university_name', 'total_score', 'world_rank'}

Clean the Data

# Clean up the `world_rank` 
def clean_world_rank(input_df):
    df = input_df.copy()
    df.world_rank = df.world_rank.str.split('-').str[0].str.split('=').str[0]
    return df
# Assign the common years of `shanghai_df` and `times_df` to `common_years`    
common_years = set(shanghai_df.year) & set(times_df.year) 

# Print `common_years`
print(common_years)
1
{2011, 2012, 2013, 2014, 2015}
# Filter years
def filter_year(input_df, years):
    df = input_df.copy()
    return df.query('year in {}'.format(list(years)))
# Clean `times_df` and `shanghai_df`
cleaned_times_df = (times_df.loc[:, common_columns]
                            .pipe(filter_year, common_years)
                            .pipe(clean_world_rank)
                            .assign(name='times'))
cleaned_shanghai_df = (shanghai_df.loc[:, common_columns]
                                  .pipe(filter_year, common_years)
                                  .pipe(clean_world_rank)
                                  .assign(name='shanghai'))

Concatenate into a single DataFrame

38% of data missing from the total_score column: drop this column with the .drop method.

# Compose `ranking_df` with `cleaned_times_df` and `cleaned_shanghai_df`
ranking_df = pd.concat([cleaned_times_df, cleaned_shanghai_df])

# Calculate the percentage of missing data
missing_data = 100 * pd.isnull(ranking_df.total_score).sum() / len(ranking_df)

# Drop the `total_score` column of `ranking_df`
ranking_df = ranking_df.drop('total_score', axis=1)

Memory Optimization

Memory can start to play a big part in how fast the pipelines can run. Without the “deep” flag turned on, Pandas won’t estimate memory consumption for the object dtype: category when dealing with categorical data, etc. int64 or even int16 takes less memory.

# Print the memory usage of `ranking_df` 
ranking_df.info()
1
2
3
4
5
6
7
8
9
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3686 entries, 0 to 4896
Data columns (total 4 columns):
year               3686 non-null int64
world_rank         3686 non-null object
university_name    3685 non-null object
name               3686 non-null object
dtypes: int64(1), object(3)
memory usage: 144.0+ KB
# Print the deep memory usage of `ranking_df` 
ranking_df.info(memory_usage="deep")
1
2
3
4
5
6
7
8
9
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3686 entries, 0 to 4896
Data columns (total 4 columns):
year               3686 non-null int64
world_rank         3686 non-null object
university_name    3685 non-null object
name               3686 non-null object
dtypes: int64(1), object(3)
memory usage: 803.1 KB

Cast Object Types

…to more appropriate ones.

def memory_change(input_df, column, dtype):
    df = input_df.copy()
    old = round(df[column].memory_usage(deep=True) / 1024, 2) # In KB
    new = round(df[column].astype(dtype).memory_usage(deep=True) / 1024, 2)# In KB
    change = round(100 * (old - new) / (old), 2)
    report = ("The inital memory footprint for {column} is: {old}KB.\n" 
              "The casted {column} now takes: {new}KB.\n"
              "A change of {change} %.").format(**locals())
    return report
# parameters:
# input_df, column, dtype
print(memory_change(ranking_df,'world_rank', 'int16'))
1
2
3
The inital memory footprint for world_rank is: 244.43KB.
The casted world_rank now takes: 36.0KB.
A change of 85.27 %.
print(memory_change(ranking_df,'university_name', 'category'))
1
2
3
The inital memory footprint for university_name is: 329.98KB.
The casted university_name now takes: 121.37KB.
A change of 63.22 %.
print(memory_change(ranking_df,'name', 'category'))
1
2
3
The inital memory footprint for name is: 257.49KB.
The casted name now takes: 32.6KB.
A change of 87.34 %.
# Cast `world_rank` as type `int16`
ranking_df.world_rank = ranking_df.world_rank.astype('int16')
# Cast `unversity_name` as type `category`
ranking_df.university_name = ranking_df.university_name.astype('category')
# Cast `name` as type `category`
ranking_df.name = ranking_df.name.astype('category')
# Double check the memory usage after type casting
ranking_df.info(memory_usage='deep')
1
2
3
4
5
6
7
8
9
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3686 entries, 0 to 4896
Data columns (total 4 columns):
year               3686 non-null int64
university_name    3685 non-null category
world_rank         3686 non-null int16
name               3686 non-null category
dtypes: category(2), int16(1), int64(1)
memory usage: 161.2 KB

From 803.1 KB!

We have a well-formed data set (i.e. it is in a tidy state) with an optimized memory footprint, data analysis can start.

Replace, Rank, Subset, groupby

Replace

‘Massachusetts Institute of Technology (MIT)’ and ‘Massachusetts Institute of Technology’ are two different records of the same university. Thus, change the first name to the latter.

# Query for the rows with university name 'Massachusetts Institute of Technology (MIT)'
print(ranking_df.query("university_name == 'Massachusetts Institute of Technology (MIT)'"))
1
2
3
4
5
6
      year                              university_name  world_rank      name
3016  2011  Massachusetts Institute of Technology (MIT)           3  shanghai
3516  2012  Massachusetts Institute of Technology (MIT)           3  shanghai
3801  2013  Massachusetts Institute of Technology (MIT)           4  shanghai
3899  2014  Massachusetts Institute of Technology (MIT)           3  shanghai
4399  2015  Massachusetts Institute of Technology (MIT)           3  shanghai
ranking_df.loc[ranking_df.university_name == 'Massachusetts Institute of Technology (MIT)', 'university_name'] = 'Massachusetts Institute of Technology'
ranking_df.university_name.head()
1
2
3
4
5
6
7
0                       Harvard University
1       California Institute of Technology
2    Massachusetts Institute of Technology
3                      Stanford University
4                     Princeton University
Name: university_name, dtype: category
Categories (785, object): [Aalborg University, Aalto University, Aarhus University, Aberystwyth University, ..., École Normale Supérieure, École Normale Supérieure de Lyon, École Polytechnique, École Polytechnique Fédérale de Lausanne]

Rank, Subset, groupby

To find the 5 (more generally n) top universities over the years, for each ranking system, here is how to do it in pseudo-code:

  • For each year (in the year column) and for each ranking system (in the name column):
    • Select the subset of the data for this given year and the given ranking system.
    • Select the 5 top universities and store them in a list.
    • Store the result in a dictionary with (year, name) as key and the list of the universities (in descending order) as the value.
# Load in `itertools`
import itertools

# Initialize `ranking`
ranking = {}

for year, name in itertools.product(common_years, ["times", "shanghai"]):
    s = (ranking_df.loc[lambda df: ((df.year == year) & (df.name == name)
                                    & (df.world_rank.isin(range(1,6)))), :]
                   .sort_values('world_rank', ascending=False)
                   .university_name)
    ranking[(year, name)] = list(s)


# Print `ranking`
print(ranking)
1
{(2011, 'times'): ['Princeton University', 'Stanford University', 'Massachusetts Institute of Technology', 'California Institute of Technology', 'Harvard University'], (2011, 'shanghai'): ['University of Cambridge', 'University of California, Berkeley', 'Massachusetts Institute of Technology', 'Stanford University', 'Harvard University'], (2012, 'times'): ['Princeton University', 'University of Oxford', 'Harvard University', 'Stanford University', 'California Institute of Technology'], (2012, 'shanghai'): ['University of Cambridge', 'University of California, Berkeley', 'Massachusetts Institute of Technology', 'Stanford University', 'Harvard University'], (2013, 'times'): ['Massachusetts Institute of Technology', 'Harvard University', 'Stanford University', 'University of Oxford', 'California Institute of Technology'], (2013, 'shanghai'): ['University of Cambridge', 'Massachusetts Institute of Technology', 'University of California, Berkeley', 'Stanford University', 'Harvard University'], (2014, 'times'): ['Massachusetts Institute of Technology', 'Stanford University', 'Harvard University', 'University of Oxford', 'California Institute of Technology'], (2014, 'shanghai'): ['University of Cambridge', 'University of California-Berkeley', 'Massachusetts Institute of Technology', 'Stanford University', 'Harvard University'], (2015, 'times'): ['University of Cambridge', 'Stanford University', 'University of Oxford', 'Harvard University', 'California Institute of Technology'], (2015, 'shanghai'): ['University of Cambridge', 'University of California, Berkeley', 'Massachusetts Institute of Technology', 'Stanford University', 'Harvard University']}

We have this ranking dictionary, let’s find out how much (in percentage) both ranking methods differ over the years: the two are 100% set-similar if the selected 5-top universities are the same even though they aren’t ranked the same.

# Import `defaultdict`
from collections import defaultdict

# Initialize `compare`
compare = defaultdict(list)

# Initialize `exact_similarity` and `set_similarity`
exact_similarity = {}
set_similarity = {}

for (year, method), universities in ranking.items():
    compare[year].append(universities)

for year, ranks in compare.items():
    set_similarity[year] = 100 * len(set(ranks[0]) & set(ranks[1])) / 5.0

# Print `set_similarity`  
print(set_similarity)
1
{2011: 60.0, 2012: 40.0, 2013: 60.0, 2014: 60.0, 2015: 60.0}

Is there a better, more idiomatic Pandas way?

# Construct a DataFrame with the top 5 universities 
top_5_df = ranking_df.loc[lambda df: df.world_rank.isin(range(1, 6)), :]

# Print the first rows of `top_5_df`
top_5_df.head()
year university_name world_rank name
0 2011 Harvard University 1 times
1 2011 California Institute of Technology 2 times
2 2011 Massachusetts Institute of Technology 3 times
3 2011 Stanford University 4 times
4 2011 Princeton University 5 times
top_5_df.tail()
year university_name world_rank name
4397 2015 Harvard University 1 shanghai
4398 2015 Stanford University 2 shanghai
4399 2015 Massachusetts Institute of Technology 3 shanghai
4400 2015 University of California, Berkeley 4 shanghai
4401 2015 University of Cambridge 5 shanghai
# Compute the similarity
def compute_set_similarity(s):
    pivoted = s.pivot(values='world_rank', columns='name', index='university_name').dropna()
    set_simlarity = 100 * len((set(pivoted['shanghai'].index) & set(pivoted['times'].index))) / 5
    return set_simlarity

# Group `top_5_df` by `year`    
grouped_df = top_5_df.groupby('year')

# Use `compute_set_similarity` to construct a DataFrame
setsimilarity_df = pd.DataFrame({'set_similarity': grouped_df.apply(compute_set_similarity)}).reset_index()

# Print the first rows of `setsimilarity_df`
setsimilarity_df.head()
year set_similarity
0 2011 60.0
1 2012 40.0
2 2013 60.0
3 2014 60.0
4 2015 60.0

Visualization

Matplotlib

import matplotlib.pyplot as plt
# Plot a scatterplot with `total_score` and `alumni`
shanghai_df.plot.scatter('total_score', 'alumni', c='year', colormap='viridis')

plt.show()

Remove values

There are some 0 values for the alumni column (0, -, NaN, etc.). Remove them.

# Replace `-` entries with NaN values
times_df['total_score'] = times_df['total_score'].replace("-", "NaN").astype('float')

# Drop all rows with NaN values for `num_students` 
times_df = times_df.dropna(subset=['num_students'], how='all')

# Cast the remaining rows with `num_students` as int
times_df['num_students'] = times_df['num_students'].astype('int')
# Plot a scatterplot with `total_score` and `num_students`
times_df.plot.scatter('total_score', 'num_students', c='year', colormap='viridis')

plt.show()

Seaborn

The Seaborn plotting tool is mainly used to create statistical plots that are visually appealing.

import seaborn as sns

# Set the Seaborn theme if desired
sns.set_style('darkgrid')
# Abbreviate country names of United States and United Kingdom
times_df['country'] = times_df['country'].replace("United States of America", "USA").replace("United Kingdom", "UK")

# Count the frequency of countries 
count = times_df['country'].value_counts()[:10]

# Convert the top 10 countries to a DataFrame 
df = count.to_frame()
# Reset the index 
#df.reset_index(level=0, inplace=True)
# or...
df['index1'] = df.index

df
country index1
USA 625 USA
UK 286 UK
Germany 150 Germany
Australia 117 Australia
Canada 108 Canada
Japan 98 Japan
Italy 94 Italy
China 82 China
Netherlands 75 Netherlands
France 73 France
# Rename the columns
df.columns = ['count', 'country',]
# Plot a barplot with `country` and `count`
sns.barplot(x='country', y='count', data=df)
sns.despine()

plt.show()

Filter rows

times_df_filtered = times_df.loc[times_df['country'].isin(['USA', 'UK', 'Canada', 'Australia', 'Germany'])]
# Barplot with `country` and `total_score`
sns.barplot(x='country', y='total_score', data=times_df_filtered)
sns.despine()

plt.show()

Correlation

import numpy as np

np.seterr(invalid='ignore')
1
{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}
sns.pairplot(times_df, hue='country')

plt.show()

g = sns.FacetGrid(times_df_filtered, col='country', hue='country')
g.map(sns.regplot, 'year', 'total_score').set(xlim=(2010, 2015), ylim=(0,100))
g.fig.subplots_adjust(wspace=.2)

sns.set(style="white")

# Compute the correlation matrix
corr = times_df.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, linewidths=.5, ax=ax)

plt.show()

To go beyond

with groupby.