Case Study (autos) : Summary Statistics

In [14]:
# Importing data 

import pandas as pd
import numpy as np

# Column names for the UCI "imports-85" automobile data; the raw file ships
# without a header row, so the names are supplied by hand.
headers = [
    "symboling", "normalized_losses", "make", "fuel-type", "aspiration",
    "num_of_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight", "engine_type",
    "num_of_cylinders", "engine_size", "fuel_system", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg",
    "price",
]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Read the file and label the columns in a single step.
# NOTE: some columns (e.g. normalized_losses, price) contain "?" entries, so
# they are loaded with object dtype rather than as numbers.
df = pd.read_csv(url, header=None, names=headers)

Summary Statistics : df.describe()

df.describe() -> skips object-type columns by default; to show all columns, add -> include = "all" (lowercase)

NaN -> meaning Not A Number

count : Number of non-null items in each column
unique : Number of distinct objects in a column (for object type)
top : Most frequently occurring object (for object type)
freq : the number of times the most frequent object (top) occurs (for object type)
mean : Average value of each column
std : Standard Deviation of each column
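min : Minimum value of each column
25% : 25th percentile (first quartile) of each column
50% : 50th percentile (median) of each column
75% : 75th percentile (third quartile) of each column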
max : Maximum value of each column
In [15]:
# Summary statistics for the numeric columns only; by default describe()
# leaves out the object-dtype columns.

df.describe()
Out[15]:
symboling wheel_base length width height curb_weight engine_size compression_ratio city_mpg highway_mpg
count 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000
mean 0.834146 98.756585 174.049268 65.907805 53.724878 2555.565854 126.907317 10.142537 25.219512 30.751220
std 1.245307 6.021776 12.337289 2.145204 2.443522 520.680204 41.642693 3.972040 6.542142 6.886443
min -2.000000 86.600000 141.100000 60.300000 47.800000 1488.000000 61.000000 7.000000 13.000000 16.000000
25% 0.000000 94.500000 166.300000 64.100000 52.000000 2145.000000 97.000000 8.600000 19.000000 25.000000
50% 1.000000 97.000000 173.200000 65.500000 54.100000 2414.000000 120.000000 9.000000 24.000000 30.000000
75% 2.000000 102.400000 183.100000 66.900000 55.500000 2935.000000 141.000000 9.400000 30.000000 34.000000
max 3.000000 120.900000 208.100000 72.300000 59.800000 4066.000000 326.000000 23.000000 49.000000 54.000000
In [16]:
# Pass include="all" to summarise every column, object-dtype ones included.
# Statistics that do not apply to a column's dtype are reported as NaN.

df.describe(include = "all")
Out[16]:
symboling normalized_losses make fuel-type aspiration num_of_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
count 205.000000 205 205 205 205 205 205 205 205 205.000000 ... 205.000000 205 205 205 205.000000 205 205 205.000000 205.000000 205
unique NaN 52 22 2 2 3 5 3 2 NaN ... NaN 8 39 37 NaN 60 24 NaN NaN 187
top NaN ? toyota gas std four sedan fwd front NaN ... NaN mpfi 3.62 3.40 NaN 68 5500 NaN NaN ?
freq NaN 41 32 185 168 114 96 120 202 NaN ... NaN 94 23 20 NaN 19 37 NaN NaN 4
mean 0.834146 NaN NaN NaN NaN NaN NaN NaN NaN 98.756585 ... 126.907317 NaN NaN NaN 10.142537 NaN NaN 25.219512 30.751220 NaN
std 1.245307 NaN NaN NaN NaN NaN NaN NaN NaN 6.021776 ... 41.642693 NaN NaN NaN 3.972040 NaN NaN 6.542142 6.886443 NaN
min -2.000000 NaN NaN NaN NaN NaN NaN NaN NaN 86.600000 ... 61.000000 NaN NaN NaN 7.000000 NaN NaN 13.000000 16.000000 NaN
25% 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN 94.500000 ... 97.000000 NaN NaN NaN 8.600000 NaN NaN 19.000000 25.000000 NaN
50% 1.000000 NaN NaN NaN NaN NaN NaN NaN NaN 97.000000 ... 120.000000 NaN NaN NaN 9.000000 NaN NaN 24.000000 30.000000 NaN
75% 2.000000 NaN NaN NaN NaN NaN NaN NaN NaN 102.400000 ... 141.000000 NaN NaN NaN 9.400000 NaN NaN 30.000000 34.000000 NaN
max 3.000000 NaN NaN NaN NaN NaN NaN NaN NaN 120.900000 ... 326.000000 NaN NaN NaN 23.000000 NaN NaN 49.000000 54.000000 NaN

11 rows × 26 columns

A concise summary of the dataframe: df.info()

  • to exclude the non-null counts, add -> show_counts = False (this keyword was called null_counts in pandas versions before 1.2)
In [17]:
# A concise summary of the dataframe: index range, each column's non-null
# count and dtype, and the memory usage.

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num_of_doors       205 non-null    object 
 6   body_style         205 non-null    object 
 7   drive_wheels       205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  num_of_cylinders   205 non-null    object 
 16  engine_size        205 non-null    int64  
 17  fuel_system        205 non-null    object 
 18  bore               205 non-null    object 
 19  stroke             205 non-null    object 
 20  compression_ratio  205 non-null    float64
 21  horsepower         205 non-null    object 
 22  peak_rpm           205 non-null    object 
 23  city_mpg           205 non-null    int64  
 24  highway_mpg        205 non-null    int64  
 25  price              205 non-null    object 
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB
In [18]:
# To hide the per-column non-null counts, pass show_counts=False.
# The older null_counts keyword was deprecated in pandas 1.2 and removed in
# pandas 2.0, where it raises a TypeError; show_counts requires pandas >= 1.2.

df.info(show_counts = False)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Dtype  
---  ------             -----  
 0   symboling          int64  
 1   normalized_losses  object 
 2   make               object 
 3   fuel-type          object 
 4   aspiration         object 
 5   num_of_doors       object 
 6   body_style         object 
 7   drive_wheels       object 
 8   engine_location    object 
 9   wheel_base         float64
 10  length             float64
 11  width              float64
 12  height             float64
 13  curb_weight        int64  
 14  engine_type        object 
 15  num_of_cylinders   object 
 16  engine_size        int64  
 17  fuel_system        object 
 18  bore               object 
 19  stroke             object 
 20  compression_ratio  float64
 21  horsepower         object 
 22  peak_rpm           object 
 23  city_mpg           int64  
 24  highway_mpg        int64  
 25  price              object 
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB