We will use data from the World Bank. It contains data on world economies for over half a century; a quick peek at the raw file follows the list below. The indicators include:
- Population
- Electricity consumption
- CO2 emissions
- Literacy rates
- Unemployment
- Mortality rates
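As a quick orientation, here is a minimal sketch that peeks at the first few lines of the raw CSV. The filename `world_dev_ind.csv` matches the exercises below, but the exact column layout is an assumption.

```python
# A minimal sketch: peek at the first few lines of the raw CSV.
# The filename matches the exercises below; the exact columns
# (country name, code, year, indicator values) are an assumption.
with open('world_dev_ind.csv') as file:
    for _ in range(3):
        print(file.readline().rstrip())
```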
Dictionaries for data science: writing a function to help you
```python
# Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides
    the keys and list2 provides the values."""

    # Zip lists: zipped_lists
    zipped_lists = zip(list1, list2)

    # Create a dictionary: rs_dict
    rs_dict = dict(zipped_lists)

    # Return the dictionary
    return rs_dict

# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)

# Print rs_fxn
print(rs_fxn)
```
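For intuition, here is a minimal sketch of `lists2dict()` on made-up lists; the column names and values are illustrative, not taken from the actual dataset.

```python
# Illustrative call to lists2dict() with made-up values
keys = ['CountryName', 'CountryCode', 'Year']
vals = ['Arab World', 'ARB', '1960']
print(lists2dict(keys, vals))
# {'CountryName': 'Arab World', 'CountryCode': 'ARB', 'Year': '1960'}
```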
Using a list comprehension
```python
# Print the first two lists in row_lists
print(row_lists[0])
print(row_lists[1])

# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])
```
Turning this all into a DataFrame:
```python
# Import the pandas package
import pandas as pd

# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)

# Print the head of the DataFrame
print(df.head())
```
Streaming data
```python
# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Skip the column names
    file.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process only the first 1000 rows
    for j in range(1000):

        # Split the current line into a list: line
        line = file.readline().split(',')

        # Get the value for the first column: first_col
        first_col = line[0]

        # If the column value is in the dict, increment its value
        if first_col in counts_dict:
            counts_dict[first_col] += 1

        # Else, add to the dict and set value to 1
        else:
            counts_dict[first_col] = 1

# Print the resulting dictionary
print(counts_dict)
```
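As a side note, the membership check can be collapsed into one line with `dict.get()`. This is just an alternative counting pattern, shown on a made-up list of country codes, not part of the exercise:

```python
# Alternative counting pattern using dict.get() with a default of 0.
# The list of codes here is made up for illustration.
counts_dict = {}
for first_col in ['ARB', 'CEB', 'ARB']:
    counts_dict[first_col] = counts_dict.get(first_col, 0) + 1
print(counts_dict)  # {'ARB': 2, 'CEB': 1}
```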
Writing a generator to load data in chunks
```python
# Define read_large_file()
def read_large_file(file_object):
    """A generator function to read a large file lazily."""

    # Loop indefinitely until the end of the file
    while True:

        # Read a line from the file: data
        data = file_object.readline()

        # Break if this is the end of the file
        if not data:
            break

        # Yield the line of data
        yield data

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)

    # Print the first three lines of the file
    print(next(gen_file))
    print(next(gen_file))
    print(next(gen_file))
```
Using the generator to process the entire file
```python
# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Iterate over the generator from read_large_file()
    for line in read_large_file(file):

        row = line.split(',')
        first_col = row[0]

        if first_col in counts_dict:
            counts_dict[first_col] += 1
        else:
            counts_dict[first_col] = 1

# Print the resulting dictionary
print(counts_dict)
```
You can also use pandas DataFrames to load the data in chunks:
```python
# Import the pandas package
import pandas as pd

# Initialize reader object: df_reader
df_reader = pd.read_csv('ind_pop.csv', chunksize=10)

# Print two chunks
print(next(df_reader))
print(next(df_reader))
```
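Because the reader object is an iterator, you can also loop over every chunk instead of pulling them one at a time with `next()`. The sketch below sums the 'Total Population' column of `ind_pop_data.csv` across all chunks; the column name is taken from the next exercise, so treat it as an assumption about the file.

```python
# A minimal sketch: aggregate over all chunks instead of just the first two.
# Assumes ind_pop_data.csv has a 'Total Population' column, as in the next exercise.
import pandas as pd

total = 0
for chunk in pd.read_csv('ind_pop_data.csv', chunksize=1000):
    total += chunk['Total Population'].sum()
print(total)
```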
```python
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Get the first DataFrame chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)

# Check out the head of the DataFrame
print(df_urb_pop.head())

# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB']

# Zip DataFrame columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
           df_pop_ceb['Urban population (% of total)'])

# Turn zip object into list: pops_list
pops_list = list(pops)

# Print pops_list
print(pops_list)
```
```python
# Import pandas and the plotting library
import pandas as pd
import matplotlib.pyplot as plt

# Define plot_pop()
def plot_pop(filename, country_code):

    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)

    # Initialize empty DataFrame: data
    data = pd.DataFrame()

    # Iterate over each DataFrame chunk
    for df_urb_pop in urb_pop_reader:

        # Check out specific country: df_pop_ceb
        # (copy to avoid modifying a view of the chunk)
        df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()

        # Zip DataFrame columns of interest: pops
        pops = zip(df_pop_ceb['Total Population'],
                   df_pop_ceb['Urban population (% of total)'])

        # Turn zip object into list: pops_list
        pops_list = list(pops)

        # Use list comprehension to create new DataFrame column 'Total Urban Population'
        df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]

        # Concatenate DataFrame chunk to data (DataFrame.append is deprecated)
        data = pd.concat([data, df_pop_ceb])

    # Plot urban population data
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()

# Set the filename: fn
fn = 'ind_pop_data.csv'

# Call plot_pop for country code 'CEB'
plot_pop(fn, 'CEB')

# Call plot_pop for country code 'ARB'
plot_pop(fn, 'ARB')
```