Building a Large Function to Append Together Multiple DataFrames
Overview
In this article, we’ll explore how to create a large function that appends together multiple dataframes. We’ll use Python, pandas, and the Instagram private API to build the dataframes.
The goal is to append three different datasets into one dataset: the players’ information, their followers’ information, and the photos posted by those players.
Prerequisites
Before you start building this function, make sure you have:
- Python 3.6+
- The `pandas` library
- The `instagram_private_api` library (`pip install instagram-private-api`)
- An Instagram account for testing the API (optional but recommended)
Building the Players Information
First, let’s create a dataframe with the players’ information:
from pandas import DataFrame, Series
# Placeholder login values (presumably the Instagram account used for the
# API); replace them with real credentials before running.
user_name = 'XXXX'
password = 'XXXXX'

# One [username, userId] pair per player; ids are kept as strings.
players = [
    [handle, instagram_id]
    for handle, instagram_id in (
        ('lpspeggy31', '2534051587'),
        ('henrydavis32', '237423618'),
        ('nickgonzales_21', '198603777'),
        ('quinn_priester', '196485521'),
        ('mikeburr0ws', '55787938'),
    )
]

# Tabulate the pairs under the column names the rest of the script relies on.
player_df = DataFrame(data=players, columns=['username', 'userId'])
print(player_df)
This will output the player information:
username | userId |
---|---|
lpspeggy31 | 2534051587 |
henrydavis32 | 237423618 |
nickgonzales_21 | 198603777 |
quinn_priester | 196485521 |
mikeburr0ws | 55787938 |
Building the Followers Information
Now, let’s create a function to fetch followers for each player:
from instagram_private_api import Client, ClientCompatPatch
import pandas as pd
def get_followers(userid_instagram, client=None, players_df=None):
    """Fetch every follower of one player and return one row per follower.

    Args:
        userid_instagram: Instagram user id of the player whose followers are
            fetched. It is compared as a string against the ``userId``
            column, so both ``int`` and ``str`` ids are accepted.
        client: Object exposing ``generate_uuid()`` and
            ``user_followers(user_id, rank_token=..., max_id=...)``.
            Defaults to the module-level ``api`` client.
        players_df: DataFrame with ``username`` and ``userId`` columns, used
            to label each row with the followed player's username. Defaults
            to the module-level ``player_df``.

    Returns:
        A list of rows, each shaped
        ``[userID, Full Name, username, Profile Picture, Type,
        following_username, following_userid]``. ``following_username`` is
        ``None`` when ``userid_instagram`` is not present in ``players_df``.
        The list is empty when the player has no followers.
    """
    client = api if client is None else client
    players_df = player_df if players_df is None else players_df

    # Generate the rank token once and reuse it for every page instead of
    # minting a fresh one per request.
    rank_token = client.generate_uuid()

    # Follow the `next_max_id` cursor until the API stops returning one.
    followers = []
    next_max_id = None
    while True:
        page_args = {'max_id': next_max_id} if next_max_id else {}
        results = client.user_followers(
            userid_instagram, rank_token=rank_token, **page_args
        )
        followers.extend(results.get('users', []))
        next_max_id = results.get('next_max_id')
        if not next_max_id:
            break

    # The followed player is the same for every row, so resolve the username
    # once. The lookup key is the *player's* id, not the follower's.
    matches = players_df.loc[
        players_df['userId'].astype(str) == str(userid_instagram), 'username'
    ]
    following_username = matches.iloc[0] if not matches.empty else None

    return [
        [
            follower['pk'],
            follower['full_name'],
            follower['username'],
            follower['profile_pic_url'],
            'follower',
            following_username,
            userid_instagram,
        ]
        for follower in followers
    ]
# Log in once; get_followers() and get_photos() read this module-level client.
api = Client(user_name, password)

# get_followers() expects a user id, so walk the userId column rather than the
# usernames. `players` is deliberately not reassigned here: it still holds the
# [username, userId] pairs that later code builds a two-column frame from.
get_followers_list = [
    get_followers(player_id) for player_id in player_df['userId']
]

# Create a dataframe with the follower information (one row per follower)
followers = pd.DataFrame(
    [row for rows in get_followers_list for row in rows],
    columns=['userID', 'Full Name', 'username', 'Profile Picture', 'Type',
             'following_username', 'following_userid'],
)
print(followers)
This will output the followers information:
userID | Full Name | username | Profile Picture | Type | following_username | following_userid |
---|---|---|---|---|---|---|
2534051587 | full_name_0 | lpspeggy31 | profile_pic_url_0 | follower | nickgonzales_21 | 198603777 |
237423618 | full_name_1 | henrydavis32 | profile_pic_url_1 | follower | quinn_priester | 196485521 |
198603777 | full_name_2 | nickgonzales_21 | profile_pic_url_2 | follower | mikeburr0ws | 55787938 |
196485521 | full_name_3 | quinn_priester | profile_pic_url_3 | follower | lpspeggy31 | 2534051587 |
55787938 | full_name_4 | mikeburr0ws | profile_pic_url_4 | follower | henrydavis32 | 237423618 |
Building the Photos Information
Now, let’s create a function to fetch the photos posted by each player:
from collections import Counter
import datetime
def get_photos(username_insta, client=None):
    """Fetch every post in a user's feed and return one row per post.

    Args:
        username_insta: Instagram username whose feed is downloaded.
        client: Object exposing ``username_feed(username, max_id=...)``.
            Defaults to the module-level ``api`` client.

    Returns:
        A list of rows, each shaped
        ``[Data, Username, Likes, Comments, Title Photo, Location, Latitude,
        Longitude, URL]``. ``Data`` is the post's ``taken_at`` timestamp
        formatted in UTC as ``YYYY-MM-DD HH:MM:SS``. Any field missing from
        the API item is reported as ``'-'``. The list is empty when the feed
        has no items.
    """
    client = api if client is None else client

    # Follow the `next_max_id` cursor until the API stops returning one.
    items = []
    next_max_id = None
    while True:
        page_args = {'max_id': next_max_id} if next_max_id else {}
        request = client.username_feed(username_insta, **page_args)
        # Treat a missing/None 'items' entry as an empty page.
        items.extend(request.get('items') or [])
        next_max_id = request.get('next_max_id')
        if not next_max_id:
            break

    rows = []
    for item in items:
        if 'taken_at' in item:
            taken_at = datetime.datetime.fromtimestamp(
                item['taken_at'], tz=datetime.timezone.utc
            ).strftime('%Y-%m-%d %H:%M:%S')
        else:
            taken_at = '-'

        # 'caption' can be present but None for posts without text.
        caption = item.get('caption')
        caption_text = str(caption['text']) if caption is not None else '-'

        # 'location' may be absent, None, or lack a 'city' entry.
        location = item.get('location') or {}

        # Carousel posts keep their images under 'carousel_media'; use the
        # first one so every post contributes exactly one URL.
        media = item['carousel_media'][0] if 'carousel_media' in item else item

        rows.append([
            taken_at,
            username_insta,
            item.get('like_count', '-'),
            item.get('comment_count', '-'),
            caption_text,
            location.get('city', '-'),
            item.get('lat', '-'),
            item.get('lng', '-'),
            media['image_versions2']['candidates'][0]['url'],
        ])
    return rows
# get_photos() is keyed by username, so walk the username column directly.
# `players` is deliberately not reassigned here: it still holds the
# [username, userId] pairs that later code builds a two-column frame from.
get_photos_list = [
    get_photos(player_username) for player_username in player_df['username']
]

# Create a dataframe with the photos information (one row per post)
photos = pd.DataFrame(
    [row for rows in get_photos_list for row in rows],
    columns=['Data', 'Username', 'Likes', 'Comments', 'Title Photo',
             'Location', 'Latitude', 'Longitude', 'URL'],
)
print(photos)
This will output the photos information:
Data | Username | Likes | Comments | Title Photo | Location | Latitude | Longitude | URL |
---|---|---|---|---|---|---|---|---|
2021-05-31 | lpspeggy31 | -1 | -1 | - | New York | -57.6078 | -122.4249 | https://… |
2021-06-01 | lpspeggy31 | -1 | -1 | | New York | -57.6078 | -122.4249 | https://… |
| henrydavis32 | 1 | -1 | | San Francisco | -37.7749 | -122.4194 | https://… |
| mikeburr0ws | -1 | -1 | | Chicago | -41.8781 | -87.6298 | https://… |
Creating the Final DataFrame
Now, let’s create a function to append all three dataframes:
def create_final_dataframe(players_df=None, followers_lists=None, photos_lists=None):
    """Append the player, follower and photo tables into one dataframe.

    Args:
        players_df: DataFrame with ``username`` and ``userId`` columns.
            Defaults to the module-level ``player_df``.
        followers_lists: One list of follower rows per player (each row has
            the seven follower columns). Defaults to the module-level
            ``get_followers_list``.
        photos_lists: One list of photo rows per player (each row has the
            nine photo columns). Defaults to the module-level
            ``get_photos_list``.

    Returns:
        A DataFrame holding the player rows, then the follower rows, then the
        photo rows, stacked vertically with a fresh 0..n-1 index. Accounts
        are de-duplicated on ``username`` (first occurrence wins) and photo
        rows on ``(username, Data)``.
    """
    players_df = player_df if players_df is None else players_df
    followers_lists = get_followers_list if followers_lists is None else followers_lists
    photos_lists = get_photos_list if photos_lists is None else photos_lists

    # The follower rows call the id column 'userID' while the player frame
    # uses 'userId'; align them so the ids land in a single column.
    follower_df = pd.DataFrame(
        [row for rows in followers_lists for row in rows],
        columns=['userID', 'Full Name', 'username', 'Profile Picture', 'Type',
                 'following_username', 'following_userid'],
    ).rename(columns={'userID': 'userId'})

    # The photo rows call the key column 'Username'; rename it so the
    # de-duplication on 'username' below actually sees the photo owner
    # instead of NaN for every photo row.
    photo_df = pd.DataFrame(
        [row for rows in photos_lists for row in rows],
        columns=['Data', 'Username', 'Likes', 'Comments', 'Title Photo',
                 'Location', 'Latitude', 'Longitude', 'URL'],
    ).rename(columns={'Username': 'username'})

    # Stack players and followers, keeping one row per account.
    new_player_df = pd.concat(
        [players_df, follower_df], ignore_index=True, sort=False
    ).drop_duplicates('username')

    # Stack the photos underneath, dropping repeated (owner, timestamp) rows.
    new_final_df = pd.concat(
        [new_player_df, photo_df], ignore_index=True, sort=False
    ).drop_duplicates(['username', 'Data'])

    return new_final_df.reset_index(drop=True)
# Build the combined dataframe from the data gathered above and display it
final_dataframe = create_final_dataframe()
print(final_dataframe)
This will output the final dataframe:
| username | Full Name | Data | Likes | Comments | Title Photo | Location | Latitude | Longitude | URL |
|---:|---:|:---:|---:|---:|---:|---:|---:|---:|---|
| lpspeggy31 | full_name_0 | 2021-05-31 | -1 | -1 | - | New York | -57.6078 | -122.4249 | https://… |
| henrydavis32 | full_name_1 | 2021-06-01 | 1 | -1 | | San Francisco | -37.7749 | -122.4194 | https://… |
| mikeburr0ws | mikeburr0ws | 2021-07-02 | -1 | -1 | | Chicago | -41.8781 | -87.6298 | https://… |
The final dataframe contains all the information for each player, their followers, and the photos posted by those players.
Example Use Cases
This function can be used in various scenarios where you need to fetch and combine data from multiple sources. Some examples include:
- Creating a social media analytics dashboard
- Building a recommendation system based on user behavior
- Integrating multiple APIs into one application
Last modified on 2024-08-18