In [1]:
import numpy as np
import pandas as pd
Data Frames¶
In [11]:
data = [
['Nissan', 'Stanza', 1991, 138, 4, 'MANUAL', 'sedan', 2000],
['Hyundai', 'Sonata', 2017, None, 4, 'AUTOMATIC', 'Sedan', 27150],
['Lotus', 'Elise', 2010, 218, 4, 'MANUAL', 'convertible', 54990],
['GMC', 'Acadia', 2017, 194, 4, 'AUTOMATIC', '4dr SUV', 34450],
['Nissan', 'Frontier', 2017, 261, 6, 'MANUAL', 'Pickup', 32340],
]
columns = [
'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
'Transmission Type', 'Vehicle_Style', 'MSRP'
]
In [12]:
df = pd.DataFrame(data, columns=columns)
In [13]:
df
Out[13]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
3 | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
4 | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
Alternatively, we can use a list of dictionaries to create a dataframe:
In [14]:
data = [
{
"Make": "Nissan",
"Model": "Stanza",
"Year": 1991,
"Engine HP": 138.0,
"Engine Cylinders": 4,
"Transmission Type": "MANUAL",
"Vehicle_Style": "sedan",
"MSRP": 2000
},
{
"Make": "Hyundai",
"Model": "Sonata",
"Year": 2017,
"Engine HP": None,
"Engine Cylinders": 4,
"Transmission Type": "AUTOMATIC",
"Vehicle_Style": "Sedan",
"MSRP": 27150
},
{
"Make": "Lotus",
"Model": "Elise",
"Year": 2010,
"Engine HP": 218.0,
"Engine Cylinders": 4,
"Transmission Type": "MANUAL",
"Vehicle_Style": "convertible",
"MSRP": 54990
},
{
"Make": "GMC",
"Model": "Acadia",
"Year": 2017,
"Engine HP": 194.0,
"Engine Cylinders": 4,
"Transmission Type": "AUTOMATIC",
"Vehicle_Style": "4dr SUV",
"MSRP": 34450
},
{
"Make": "Nissan",
"Model": "Frontier",
"Year": 2017,
"Engine HP": 261.0,
"Engine Cylinders": 6,
"Transmission Type": "MANUAL",
"Vehicle_Style": "Pickup",
"MSRP": 32340
}
]
In [15]:
df2 = pd.DataFrame(data)
In [16]:
df2
Out[16]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
3 | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
4 | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [19]:
df.head(2)
Out[19]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
Series¶
In [20]:
Out[20]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
3 | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
4 | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [21]:
df.Make #This does not work with column names with spaces
Out[21]:
0 Nissan 1 Hyundai 2 Lotus 3 GMC 4 Nissan Name: Make, dtype: object
In [22]:
df['Make']
Out[22]:
0 Nissan 1 Hyundai 2 Lotus 3 GMC 4 Nissan Name: Make, dtype: object
In [23]:
df[['Make', 'Model', 'MSRP']]
Out[23]:
Make | Model | MSRP | |
---|---|---|---|
0 | Nissan | Stanza | 2000 |
1 | Hyundai | Sonata | 27150 |
2 | Lotus | Elise | 54990 |
3 | GMC | Acadia | 34450 |
4 | Nissan | Frontier | 32340 |
In [24]:
df['id'] = [1, 2, 3, 4, 5]
In [25]:
df
Out[25]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | id | |
---|---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 | 1 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 | 2 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 | 3 |
3 | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 | 4 |
4 | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 | 5 |
In [26]:
del df['id']
In [27]:
df
Out[27]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
3 | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
4 | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [ ]:
Index¶
In [28]:
df.index
Out[28]:
RangeIndex(start=0, stop=5, step=1)
In [29]:
df.Make
Out[29]:
0 Nissan 1 Hyundai 2 Lotus 3 GMC 4 Nissan Name: Make, dtype: object
In [30]:
df.Make.index
Out[30]:
RangeIndex(start=0, stop=5, step=1)
In [31]:
df.loc[1] #Row
Out[31]:
Make Hyundai Model Sonata Year 2017 Engine HP NaN Engine Cylinders 4 Transmission Type AUTOMATIC Vehicle_Style Sedan MSRP 27150 Name: 1, dtype: object
In [32]:
df.loc[[1, 2]]
Out[32]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
In [33]:
df.index = ['a', 'b', 'c', 'd', 'e']
In [34]:
df.loc[['b', 'c']]
Out[34]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
In [35]:
df.iloc[1]
Out[35]:
Make Hyundai Model Sonata Year 2017 Engine HP NaN Engine Cylinders 4 Transmission Type AUTOMATIC Vehicle_Style Sedan MSRP 27150 Name: b, dtype: object
In [36]:
df.reset_index()
Out[36]:
index | Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|---|
0 | a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
3 | d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
4 | e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [37]:
df.reset_index(drop=True) #Creates new dataframe
Out[37]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
3 | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
4 | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [38]:
df #df is still with the alphabet index even though we reset it
Out[38]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [41]:
df3 = df.reset_index(drop=True) #Assign it to a new variable to keep
In [42]:
df3
Out[42]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
0 | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
1 | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
2 | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
3 | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
4 | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
Accessing elements¶
In [43]:
# was done above
Element-wise operations¶
In [44]:
df['Engine HP']
Out[44]:
a 138.0 b NaN c 218.0 d 194.0 e 261.0 Name: Engine HP, dtype: float64
In [45]:
df['Engine HP'] / 100
Out[45]:
a 1.38 b NaN c 2.18 d 1.94 e 2.61 Name: Engine HP, dtype: float64
In [46]:
df['Engine HP'] * 2
Out[46]:
a 276.0 b NaN c 436.0 d 388.0 e 522.0 Name: Engine HP, dtype: float64
In [47]:
df['Year'] > 2015
Out[47]:
a False b True c False d True e True Name: Year, dtype: bool
In [ ]:
Filtering¶
In [48]:
df[
df['Year'] >= 2015
]
Out[48]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [49]:
df[
df['Make'] == 'Nissan'
]
Out[49]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [50]:
df[
(df['Make'] == 'Nissan') & (df['Year'] >= 2015)
]
Out[50]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
String operations¶
In [51]:
df
Out[51]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [52]:
df['Vehicle_Style']
Out[52]:
a sedan b Sedan c convertible d 4dr SUV e Pickup Name: Vehicle_Style, dtype: object
In [53]:
df['Vehicle_Style'].str.lower()
Out[53]:
a sedan b sedan c convertible d 4dr suv e pickup Name: Vehicle_Style, dtype: object
In [54]:
df['Vehicle_Style'].str.replace(' ', '_')
Out[54]:
a sedan b Sedan c convertible d 4dr_SUV e Pickup Name: Vehicle_Style, dtype: object
In [55]:
df['Vehicle_Style'].str.replace(' ', '_').str.lower()
Out[55]:
a sedan b sedan c convertible d 4dr_suv e pickup Name: Vehicle_Style, dtype: object
In [56]:
df
Out[56]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | Sedan | 27150 |
c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr SUV | 34450 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | Pickup | 32340 |
In [57]:
df['Vehicle_Style']=df['Vehicle_Style'].str.replace(' ', '_').str.lower()
In [58]:
df
Out[58]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | sedan | 27150 |
c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr_suv | 34450 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | pickup | 32340 |
Summarazing operations¶
In [59]:
df.MSRP.max()
Out[59]:
54990
In [60]:
df.MSRP.min()
Out[60]:
2000
In [61]:
df.MSRP.describe()
Out[61]:
count 5.000000 mean 30186.000000 std 18985.044904 min 2000.000000 25% 27150.000000 50% 32340.000000 75% 34450.000000 max 54990.000000 Name: MSRP, dtype: float64
In [62]:
df.describe()
Out[62]:
Year | Engine HP | Engine Cylinders | MSRP | |
---|---|---|---|---|
count | 5.000000 | 4.00000 | 5.000000 | 5.000000 |
mean | 2010.400000 | 202.75000 | 4.400000 | 30186.000000 |
std | 11.260551 | 51.29896 | 0.894427 | 18985.044904 |
min | 1991.000000 | 138.00000 | 4.000000 | 2000.000000 |
25% | 2010.000000 | 180.00000 | 4.000000 | 27150.000000 |
50% | 2017.000000 | 206.00000 | 4.000000 | 32340.000000 |
75% | 2017.000000 | 228.75000 | 4.000000 | 34450.000000 |
max | 2017.000000 | 261.00000 | 6.000000 | 54990.000000 |
In [63]:
df.describe().round(2)
Out[63]:
Year | Engine HP | Engine Cylinders | MSRP | |
---|---|---|---|---|
count | 5.00 | 4.00 | 5.00 | 5.00 |
mean | 2010.40 | 202.75 | 4.40 | 30186.00 |
std | 11.26 | 51.30 | 0.89 | 18985.04 |
min | 1991.00 | 138.00 | 4.00 | 2000.00 |
25% | 2010.00 | 180.00 | 4.00 | 27150.00 |
50% | 2017.00 | 206.00 | 4.00 | 32340.00 |
75% | 2017.00 | 228.75 | 4.00 | 34450.00 |
max | 2017.00 | 261.00 | 6.00 | 54990.00 |
In [64]:
df.Make
Out[64]:
a Nissan b Hyundai c Lotus d GMC e Nissan Name: Make, dtype: object
In [65]:
df.Make.nunique()
Out[65]:
4
In [66]:
df.nunique()
Out[66]:
Make 4 Model 5 Year 3 Engine HP 4 Engine Cylinders 2 Transmission Type 2 Vehicle_Style 4 MSRP 5 dtype: int64
In [67]:
df
Out[67]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | sedan | 27150 |
c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr_suv | 34450 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | pickup | 32340 |
Missing values¶
In [68]:
df.isnull()
Out[68]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | False | False | False | False | False | False | False | False |
b | False | False | False | True | False | False | False | False |
c | False | False | False | False | False | False | False | False |
d | False | False | False | False | False | False | False | False |
e | False | False | False | False | False | False | False | False |
In [70]:
df.isnull().sum()
Out[70]:
Make 0 Model 0 Year 0 Engine HP 1 Engine Cylinders 0 Transmission Type 0 Vehicle_Style 0 MSRP 0 dtype: int64
Grouping¶
SELECT
transmission_type,
AVG(MSRP)
FROM
cars
GROUP BY
transmission_type
In [71]:
df.groupby('Transmission Type').MSRP.mean()
Out[71]:
Transmission Type AUTOMATIC 30800.000000 MANUAL 29776.666667 Name: MSRP, dtype: float64
In [72]:
df.groupby('Transmission Type').MSRP.min()
Out[72]:
Transmission Type AUTOMATIC 27150 MANUAL 2000 Name: MSRP, dtype: int64
In [73]:
df.groupby('Transmission Type').MSRP.max()
Out[73]:
Transmission Type AUTOMATIC 34450 MANUAL 54990 Name: MSRP, dtype: int64
In [ ]:
Getting the NumPy arrays¶
In [74]:
df.MSRP.values
Out[74]:
array([ 2000, 27150, 54990, 34450, 32340])
In [75]:
df
Out[75]:
Make | Model | Year | Engine HP | Engine Cylinders | Transmission Type | Vehicle_Style | MSRP | |
---|---|---|---|---|---|---|---|---|
a | Nissan | Stanza | 1991 | 138.0 | 4 | MANUAL | sedan | 2000 |
b | Hyundai | Sonata | 2017 | NaN | 4 | AUTOMATIC | sedan | 27150 |
c | Lotus | Elise | 2010 | 218.0 | 4 | MANUAL | convertible | 54990 |
d | GMC | Acadia | 2017 | 194.0 | 4 | AUTOMATIC | 4dr_suv | 34450 |
e | Nissan | Frontier | 2017 | 261.0 | 6 | MANUAL | pickup | 32340 |
In [76]:
df.to_dict(orient='records')
Out[76]:
[{'Make': 'Nissan', 'Model': 'Stanza', 'Year': 1991, 'Engine HP': 138.0, 'Engine Cylinders': 4, 'Transmission Type': 'MANUAL', 'Vehicle_Style': 'sedan', 'MSRP': 2000}, {'Make': 'Hyundai', 'Model': 'Sonata', 'Year': 2017, 'Engine HP': nan, 'Engine Cylinders': 4, 'Transmission Type': 'AUTOMATIC', 'Vehicle_Style': 'sedan', 'MSRP': 27150}, {'Make': 'Lotus', 'Model': 'Elise', 'Year': 2010, 'Engine HP': 218.0, 'Engine Cylinders': 4, 'Transmission Type': 'MANUAL', 'Vehicle_Style': 'convertible', 'MSRP': 54990}, {'Make': 'GMC', 'Model': 'Acadia', 'Year': 2017, 'Engine HP': 194.0, 'Engine Cylinders': 4, 'Transmission Type': 'AUTOMATIC', 'Vehicle_Style': '4dr_suv', 'MSRP': 34450}, {'Make': 'Nissan', 'Model': 'Frontier', 'Year': 2017, 'Engine HP': 261.0, 'Engine Cylinders': 6, 'Transmission Type': 'MANUAL', 'Vehicle_Style': 'pickup', 'MSRP': 32340}]
In [ ]:
Leave a Reply