AP Prep/Notes

College Board talks about ideas like

  • Tools.
    "the ability to process data depends on users' capabilities and their tools"
  • Combining Data.
    "combine county data sets"
  • Statistics on Data.
    "determining the artist with the greatest attendance during a particular month"
  • Data poses challenges.
    "the need to clean data", "incomplete data"

Pandas and DataFrames

import pandas as pd

# Load the player table from nba.csv (in the working directory) into a
# DataFrame.
df = pd.read_csv(filepath_or_buffer='nba.csv')

# Printing a large frame shows an abbreviated view: only the first and last
# rows, with the middle columns elided, followed by the full dimensions.
print(df)
     RANK                     NAME TEAM  POS   AGE  GP   MPG  USG%    TO%  \
0       1              Joel Embiid  Phi  C-F  29.0  54  35.0  37.3   14.4   
1       2              Luka Doncic  Dal  F-G  24.0  57  36.3  38.2   14.3   
2       3           Damian Lillard  Por    G  32.7  54  36.3  33.8   13.5   
3       4    Giannis Antetokounmpo  Mil    F  28.3  52  32.5  38.9   16.2   
4       5  Shai Gilgeous-Alexander  Okc  G-F  24.7  57  35.6  32.7   12.4   
..    ...                      ...  ...  ...   ...  ..   ...   ...    ...   
578   579          Marko Simonovic  Chi    C  23.4   3   1.5   9.6    0.0   
579   580         Alondes Williams  Bro    G  23.7   1   5.3  16.8  100.0   
580   581        Lindell Wigginton  Mil    G  25.0   1   0.7   0.0    NaN   
581   582           Omer Yurtseven  Mia    C  24.7   1   7.2   6.1    0.0   
582   583           Jamaree Bouyea  Was    G  23.7   1   5.5   7.9    0.0   

     FTA  ...  APG  SPG  BPG  TPG   P+R   P+A  P+R+A    VI   ORtg   DRtg  
0    644  ...  4.1  1.1  1.6  3.4  43.4  37.5   47.5  12.7  123.4  103.8  
1    629  ...  8.0  1.5  0.5  3.7  41.6  41.0   49.6  14.5  120.6  108.4  
2    508  ...  7.3  0.9  0.3  3.2  36.8  39.4   44.1  11.4  127.7  116.2  
3    659  ...  5.5  0.7  0.8  4.0  43.1  36.7   48.6  15.6  115.8   99.2  
4    610  ...  5.6  1.7  1.1  2.9  36.0  36.8   41.6  10.6  124.7  109.5  
..   ...  ...  ...  ...  ...  ...   ...   ...    ...   ...    ...    ...  
578    0  ...  0.0  0.0  0.0  0.0   0.0   0.0    0.0   0.0    NaN    NaN  
579    0  ...  0.0  0.0  0.0  2.0   1.0   0.0    1.0   0.0    NaN   97.8  
580    0  ...  0.0  0.0  0.0  0.0   0.0   0.0    0.0   0.0    NaN    NaN  
581    0  ...  0.0  0.0  0.0  0.0   1.0   0.0    1.0   0.0   69.3  105.6  
582    0  ...  0.0  0.0  0.0  0.0   1.0   0.0    1.0   0.0    0.0   99.1  

[583 rows x 29 columns]
import pandas as pd

# Reload the full table so this cell can run on its own.
df = pd.read_csv('nba.csv')

# Build a narrower frame by dropping the 19 columns listed below; the
# remaining 10 columns are the ones shown in the output. `df` is not changed.
df2 = df.drop(
    columns=[
        'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%',
        'PPG', 'RPG', 'APG', 'SPG', 'BPG', 'TPG',
        'P+R', 'P+A', 'P+R+A', 'VI', 'ORtg', 'DRtg',
    ]
)

print(df2)
     RANK                     NAME TEAM  POS   AGE  GP   MPG  USG%    TO%  FTA
0       1              Joel Embiid  Phi  C-F  29.0  54  35.0  37.3   14.4  644
1       2              Luka Doncic  Dal  F-G  24.0  57  36.3  38.2   14.3  629
2       3           Damian Lillard  Por    G  32.7  54  36.3  33.8   13.5  508
3       4    Giannis Antetokounmpo  Mil    F  28.3  52  32.5  38.9   16.2  659
4       5  Shai Gilgeous-Alexander  Okc  G-F  24.7  57  35.6  32.7   12.4  610
..    ...                      ...  ...  ...   ...  ..   ...   ...    ...  ...
578   579          Marko Simonovic  Chi    C  23.4   3   1.5   9.6    0.0    0
579   580         Alondes Williams  Bro    G  23.7   1   5.3  16.8  100.0    0
580   581        Lindell Wigginton  Mil    G  25.0   1   0.7   0.0    NaN    0
581   582           Omer Yurtseven  Mia    C  24.7   1   7.2   6.1    0.0    0
582   583           Jamaree Bouyea  Was    G  23.7   1   5.5   7.9    0.0    0

[583 rows x 10 columns]
import pandas as pd


# Sort the trimmed frame (df2) by the AGE column, oldest first.
# sort_values returns a new frame, so df2 keeps its original row order.
df3 = df2.sort_values(by='AGE', ascending=False)

# Because the order is descending, the first ten rows are the oldest players...
print("\n--Oldest Top 10---------\n")
print(df3.head(10))

# ...and the last ten rows are the youngest.
print("\n--Youngest Bottom 10------\n")
print(df3.tail(10))
--Oldest Top 10---------

     RANK            NAME TEAM  POS   AGE  GP   MPG  USG%   TO%  FTA
568   569   Udonis Haslem  Mia    F  42.8   6   7.6  12.5   0.0    2
528   529  Andre Iguodala  Gol  G-F  39.1   7  13.8   9.1  42.9    0
8       9    LeBron James  Lal    F  38.2  47  36.1  33.2  12.2  295
447   448     P.J. Tucker  Phi    F  37.9  64  25.7   6.3  17.0   22
117   118      Chris Paul  Pho    G  37.9  46  32.1  19.2  15.6  137
450   451      Taj Gibson  Was    F  37.7  41   9.9  15.3  16.5   36
145   146      Kyle Lowry  Mia    G  37.0  45  33.3  17.0  17.1  119
437   438     George Hill  Ind    G  36.9   3  14.1  10.9  27.3    0
375   376     George Hill  Mil    G  36.9  35  19.1  11.3  16.9   46
307   308    Goran Dragic  Chi    G  36.9  50  15.6  20.9  15.6   41

--Youngest Bottom 10------

     RANK              NAME TEAM POS   AGE  GP   MPG  USG%   TO%  FTA
171   172     Jeremy Sochan  San   F  19.8  53  26.3  19.5  15.1  108
259   260    Shaedon Sharpe  Por   G  19.8  67  20.5  16.7  10.6   60
346   347     Ousmane Dieng  Okc   F  19.8  25  16.4  15.4  10.5   17
211   212    Malaki Branham  San   F  19.8  52  22.3  18.1  12.1   50
138   139  Jabari Smith Jr.  Hou   F  19.8  64  30.3  18.4  11.4  163
461   462   Dominick Barlow  San   F  19.8  14  11.0  12.6  19.4   13
532   533     Kendall Brown  Ind   G  19.8   6   6.7  10.3  21.7    2
535   536      Trevor Keels  Nyk   G  19.5   2   2.0  33.0   0.0    0
225   226        AJ Griffin  Atl   F  19.5  60  19.7  17.7   7.1   34
232   233       Jalen Duren  Det   C  19.3  54  24.9  13.4  17.9  123
# DataFrame.info() prints the summary (index range, per-column non-null
# counts and dtypes, memory usage) by itself and returns None, so wrapping
# it in print() is what adds the trailing "None" line to the output.
# The non-null count for TO% (581 of 583) reveals two missing values.
print(df2.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RANK    583 non-null    int64  
 1   NAME    583 non-null    object 
 2   TEAM    583 non-null    object 
 3   POS     583 non-null    object 
 4   AGE     583 non-null    float64
 5   GP      583 non-null    int64  
 6   MPG     583 non-null    float64
 7   USG%    583 non-null    float64
 8   TO%     581 non-null    float64
 9   FTA     583 non-null    int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 45.7+ KB
None
import pandas as pd

# The data can be stored as a Python dictionary: each key becomes a column
# name and each list holds that column's values, one per row.
# The variable is called `people` rather than `dict` so that the built-in
# dict type is not shadowed for the rest of the session.
people = {
    "Safin": [16, 7132006, 1],
    "Johnny": [16, 6222006, 1],
}

# Store the data in a DataFrame; with no index given, rows are numbered 0, 1, 2.
print("-------------Dict_to_DF------------------")
df = pd.DataFrame(people)
print(df)

print("----------Dict_to_DF_labels--------------")

# With the index argument, each row gets a descriptive label instead of a
# number. The labels must be listed in the same order as the list entries.
df = pd.DataFrame(people, index=["age", "bday", "siblings"])
print(df)
-------------Dict_to_DF------------------
     Safin   Johnny
0       16       16
1  7132006  6222006
2        1        1
----------Dict_to_DF_labels--------------
            Safin   Johnny
age            16       16
bday      7132006  6222006
siblings        1        1
# DataFrame.info() prints its summary directly and returns None, which is
# why the output ends with a "None" line when the call is wrapped in print().
# Here the index is the row labels ("age" to "siblings") rather than a range.
print(df.info())
<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, age to siblings
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Safin   3 non-null      int64
 1   Johnny  3 non-null      int64
dtypes: int64(2)
memory usage: 72.0+ bytes
None