We will use the data from the world bank. It contains data on world economies for over half a century. The indicators are :
- Population
- Electricity consumption
- CO2 emissions
- Literacy rates
- Unemployment
- Mortality rates
Dictionaries for data science+ Writing a function to help you
# Define lists2dict()
def lists2dict(list1, list2):
"""Return a dictionary where list1 provides
the keys and list2 provides the values."""
# Zip lists: zipped_lists
zipped_lists = zip(list1, list2)
# Create a dictionary: rs_dict
rs_dict = dict(zipped_lists)
# Return the dictionary
return rs_dict
# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names,row_vals)
# Print rs_fxn
print(rs_fxn)Using a list comprehension
# Print the first two lists in row_lists print(row_lists[0]) print(row_lists[1]) # Turn list of lists into list of dicts: list_of_dicts list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists] # Print the first two dictionaries in list_of_dicts print(list_of_dicts[0]) print(list_of_dicts[1])
Turning this all into a dataframe :
# Import the pandas package import pandas as pd # Turn list of lists into list of dicts: list_of_dicts list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists] # Turn list of dicts into a DataFrame: df df = pd.DataFrame(list_of_dicts) # Print the head of the DataFrame print(df.head())
Streaming data
# Open a connection to the file
with open('world_dev_ind.csv') as file:
# Skip the column names
file.readline()
# Initialize an empty dictionary: counts_dict
counts_dict = {}
# Process only the first 1000 rows
for j in range(1000):
# Split the current line into a list: line
line = file.readline().split(',')
# Get the value for the first column: first_col
first_col = line[0]
# If the column value is in the dict, increment its value
if first_col in counts_dict.keys():
counts_dict[first_col] += 1
# Else, add to the dict and set value to 1
else:
counts_dict[first_col] = 1
# Print the resulting dictionary
print(counts_dict)Writing a generator to load data in chunks
# Define read_large_file()
def read_large_file(file_object):
"""A generator function to read a large file lazily."""
# Loop indefinitely until the end of the file
while True:
# Read a line from the file: data
data = file_object.readline()
# Break if this is the end of the file
if not data:
break
# Yield the line of data
yield data
# Open a connection to the file
with open('world_dev_ind.csv') as file:
# Create a generator object for the file: gen_file
gen_file = read_large_file(file)
# Print the first three lines of the file
print(next(gen_file))
print(next(gen_file))
print(next(gen_file))Writing a generator to load data in chunks
# Initialize an empty dictionary: counts_dict
counts_dict = {}
# Open a connection to the file
with open('world_dev_ind.csv') as file:
# Iterate over the generator from read_large_file()
for line in read_large_file(file):
row = line.split(',')
first_col = row[0]
if first_col in counts_dict.keys():
counts_dict[first_col] += 1
else:
counts_dict[first_col] = 1
# Print
print(counts_dict)You also use dataframes tro load the data in chunks :
# Import the pandas package
import pandas as pd
# Initialize reader object: df_reader
df_reader = pd.read_csv('ind_pop.csv',chunksize=10)
# Print two chunks
print(next(df_reader))
print(next(df_reader))
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv',chunksize=1000)
# Get the first DataFrame chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)
# Check out the head of the DataFrame
print(df_urb_pop.head())
# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode']=='CEB']
# Zip DataFrame columns of interest: pops
pops = zip(df_pop_ceb['Total Population'], df_pop_ceb['Urban population (% of total)'])
# Turn zip object into list: pops_list
pops_list = list(pops)
# Print pops_list
print(pops_list)# Define plot_pop()
def plot_pop(filename,country_code):
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv(filename, chunksize=1000)
# Initialize empty DataFrame: data
data = pd.DataFrame()
# Iterate over each DataFrame chunk
for df_urb_pop in urb_pop_reader:
# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code]
# Zip DataFrame columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
df_pop_ceb['Urban population (% of total)'])
# Turn zip object into list: pops_list
pops_list = list(pops)
# Use list comprehension to create new DataFrame column 'Total Urban Population'
df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
# Append DataFrame chunk to data: data
data = data.append(df_pop_ceb)
# Plot urban population data
data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()
# Set the filename: fn
fn = 'ind_pop_data.csv'
# Call plot_pop for country code 'CEB'
plot_pop('ind_pop_data.csv','CEB')
# Call plot_pop for country code 'ARB'
plot_pop('ind_pop_data.csv','ARB')