Библиотеки¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from pprint import pprint
TODO¶
- Подписи к графикам
- Переделать все под
pd.DataFrame(?) - Поправить заголовки
Данные¶
Payments¶
In [2]:
payments = pd.read_csv("data/payments.csv")
payments.head()
Out[2]:
| payment_date | Payment_types | real_cost | account_id | |
|---|---|---|---|---|
| 0 | 2021-01-01 00:00:56 | Payment_system_5 | 9.650000 | 2907221 |
| 1 | 2021-01-01 00:01:48 | Payment_system_5 | 0.955872 | 3228373 |
| 2 | 2021-01-01 00:03:42 | Payment_system_5 | 3.026930 | 318552 |
| 3 | 2021-01-01 00:04:27 | Payment_system_2 | 99.990000 | 3832817 |
| 4 | 2021-01-01 00:06:41 | Payment_system_5 | 1.432550 | 7229767 |
Persents¶
In [3]:
persents = pd.read_csv("data/persents.csv", index_col=0)
persents.head()
Out[3]:
| Payment_types | Share, % | |
|---|---|---|
| 0 | Payment_system_1 | 20.7 |
| 1 | Payment_system_2 | 15.0 |
| 2 | Payment_system_3 | 58.5 |
| 3 | Payment_system_4 | 0.0 |
| 4 | Payment_system_5 | 24.0 |
Registrations¶
In [4]:
registrations = pd.read_csv("data/registrations.csv", index_col=0)#.dropna()
registrations.head()
Out[4]:
| account_id | created_date | campaign | |
|---|---|---|---|
| 0 | 3842380.0 | 2021-01-01 | BRA_MS1_install |
| 1 | 3842381.0 | 2021-01-01 | NaN |
| 2 | 3842382.0 | 2021-01-01 | NaN |
| 3 | 3842383.0 | 2021-01-01 | NaN |
| 4 | 3842384.0 | 2021-01-01 | BRA_MS1_install |
Budget¶
In [5]:
budget = pd.read_excel("data/Рекламный бюджет январь 2021.xlsx")
budget.loc[0:5, 'media_source'] = 'Media_source_1'
budget.loc[6:7, 'media_source'] = 'Media_source_2'
budget
Out[5]:
| media_source | Campaign_type | Target | Plan, USD | Installs | Spend, USD | Deviation, USD | |
|---|---|---|---|---|---|---|---|
| 0 | Media_source_1 | install | Brazil | 100 | 10460 | 99.12 | 0.88 |
| 1 | Media_source_1 | purchase | Brazil | 400 | 1081 | 398.67 | 1.33 |
| 2 | Media_source_1 | install | Russia | 200 | 11894 | 246.30 | -46.30 |
| 3 | Media_source_1 | purchase | Russia | 500 | 1441 | 616.77 | -116.77 |
| 4 | Media_source_1 | install | Ukrane | 100 | 6424 | 99.43 | 0.57 |
| 5 | Media_source_1 | purchase | Ukrane | 500 | 242 | 348.13 | 151.87 |
| 6 | Media_source_2 | install | Brazil | 100 | 152 | 81.15 | 18.85 |
| 7 | Media_source_2 | install | English_speaking | 100 | 371 | 99.95 | 0.05 |
| 8 | Total | NaN | NaN | 2000 | 32065 | 1989.52 | 10.48 |
Analisys¶
Оптимальный срок окупаемости¶
$$\Large T = \frac{IC}{FV} $$
где:
- $\large T$ — срок окупаемости
- $\large IC$ — инвестиционные расходы
- $\large FV$ — прибыль
In [6]:
df = payments.join(persents.set_index("Payment_types"), on="Payment_types")
df['cost'] = df['real_cost'] * (100 - df['Share, %']) / 100
df = df.join(registrations.set_index('account_id'), on="account_id").dropna()
df
Out[6]:
| payment_date | Payment_types | real_cost | account_id | Share, % | cost | created_date | campaign | |
|---|---|---|---|---|---|---|---|---|
| 330 | 2021-01-01 15:10:26 | Payment_system_5 | 2.445710 | 3861811 | 24.0 | 1.858740 | 2021-01-07 | RUS_MS1_purchase |
| 339 | 2021-01-01 15:22:00 | Payment_system_5 | 2.445710 | 3861811 | 24.0 | 1.858740 | 2021-01-07 | RUS_MS1_purchase |
| 827 | 2021-01-01 17:16:09 | Payment_system_5 | 9.624570 | 3844647 | 24.0 | 7.314673 | 2021-01-01 | BRA_MS1_install |
| 837 | 2021-01-01 17:17:45 | Payment_system_5 | 9.624570 | 3844647 | 24.0 | 7.314673 | 2021-01-01 | BRA_MS1_install |
| 1922 | 2021-01-02 01:08:32 | Payment_system_5 | 0.955872 | 3845613 | 24.0 | 0.726463 | 2021-01-02 | RUS_MS1_purchase |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 301124 | 2021-12-29 17:14:22 | Payment_system_4 | 2.620000 | 3924840 | 0.0 | 2.620000 | 2021-01-29 | RUS_MS1_install |
| 301169 | 2021-12-29 18:55:43 | Payment_system_4 | 0.100000 | 3924840 | 0.0 | 0.100000 | 2021-01-29 | RUS_MS1_install |
| 301243 | 2021-12-29 21:22:10 | Payment_system_4 | 0.310000 | 3924840 | 0.0 | 0.310000 | 2021-01-29 | RUS_MS1_install |
| 301471 | 2021-12-30 08:15:58 | Payment_system_4 | 0.380000 | 3924840 | 0.0 | 0.380000 | 2021-01-29 | RUS_MS1_install |
| 303332 | 2021-12-31 20:58:48 | Payment_system_5 | 2.309120 | 3861811 | 24.0 | 1.754931 | 2021-01-07 | RUS_MS1_purchase |
711 rows × 8 columns
In [7]:
d = {
("Media_source_1", "install", "Brazil"): "BRA_MS1_install",
("Media_source_1", "purchase", "Brazil"): "BRA_MS1_purchase",
("Media_source_1", "install", "Russia"): "RUS_MS1_install",
("Media_source_1", "purchase", "Russia"): "RUS_MS1_purchase",
("Media_source_1", "install", "Ukrane"): "UKR_MS1_install",
("Media_source_1", "purchase", "Ukrane"): "UKR_MS1_purchase",
("Media_source_2", "install", "Brazil"): "BRA_MS2_install",
("Media_source_2", "install", "English_speaking"): "ENG_MS2_install",
}
In [8]:
T = []
for l in budget.iloc:
source, campaign_type, target, *_, spend, _ = l
if source == 'Total':
break
cmp = d[source, campaign_type, target]
tmp_df = df[df['campaign'] == cmp]
T.append((source, campaign_type, target, spend / tmp_df['cost'].sum()))
In [29]:
T_df = pd.DataFrame(T, columns=["Media Source", 'Campaign type', 'Target', "Time, (months)"])
T_df = T_df.set_index(["Media Source", 'Campaign type', 'Target'])
T_df.to_csv('tables/payback-period.csv')
T_df.to_markdown('tables/payback-period.md')
In [10]:
fig, ax = plt.subplots(1, 1, figsize=(8,6), dpi=160)
ax.grid()
# ax.set_xlim(1, 70)
ax.set(xlabel='Срок окупаемости, (месяцы)',
ylabel='Рекламная компания',
title='Оптимальный срок окупаемости')
plt.xticks(rotation='vertical')
sns.barplot(ax=ax, data=T_df, x="Time, (months)", hue=T_df.index)
plt.legend(title="");
plt.savefig("pics/payback-period.png")
Определить наиболее успешные кейсы¶
Рекламные кампании¶
In [30]:
T_sorted_df = T_df.sort_values(by='Time, (months)')
T_sorted_df.to_csv('tables/payback-period.sorted.csv')
T_sorted_df.to_markdown('tables/payback-period.sorted.md')
In [12]:
fig, ax = plt.subplots(1, 1, figsize=(8,6), dpi=160)
ax.grid()
ax.set(xlabel='Срок окупаемости, месяцы', ylabel='Рекламная компания', title='Оптимальный срок окупаемости')
# plt.xticks(rotation='vertical')
sns.barplot(ax=ax, data=T_sorted_df, x="Time, (months)", hue=T_df.index)
plt.legend(title="");
plt.savefig("pics/payback-period.sorted.png")
Тип рекламных кампаний¶
In [13]:
fig, ax = plt.subplots(1, 1, figsize=(4,6), dpi=160)
ax.grid()
ax.set(title='Оптимальный срок окупаемости, \nсгруппированный по типам рекламных кампаний')
sns.boxplot(ax=ax, data=T_sorted_df, y='Time, (months)', hue="Campaign type");
plt.savefig("pics/payback-period.ct.png")
Рекламная сеть¶
In [14]:
fig, ax = plt.subplots(1, 1, figsize=(4,6), dpi=160)
ax.grid()
ax.set(title='Оптимальный срок окупаемости, \nсгруппированный по рекламным сетям')
sns.boxplot(ax=ax, data=T_sorted_df, y='Time, (months)', hue="Media Source");
plt.savefig("pics/payback-period.ms.png")
Накопительный ARPU успешных рекламных кампаний¶
$$\Large ARPU = \frac{\text{Доход}}{\text{Число пользователей}} $$
In [15]:
from datetime import date, datetime, timedelta
In [16]:
df = registrations.dropna()
df = payments.join(df.set_index("account_id"), on="account_id").dropna()
df = df.join(persents.set_index("Payment_types"), on="Payment_types")
df['cost'] = df['real_cost'] * (100 - df['Share, %']) / 100
df['payment_date'] = pd.to_datetime(df['payment_date'], format="%Y-%m-%d %H:%M:%S").dt.date
df['created_date'] = pd.to_datetime(df['created_date'], format="%Y-%m-%d").dt.date
df = df[df['campaign'].str.contains("purchase")]
table = df[["account_id", "created_date", "payment_date", "cost", "campaign"]]
table
Out[16]:
| account_id | created_date | payment_date | cost | campaign | |
|---|---|---|---|---|---|
| 330 | 3861811 | 2021-01-07 | 2021-01-01 | 1.858740 | RUS_MS1_purchase |
| 339 | 3861811 | 2021-01-07 | 2021-01-01 | 1.858740 | RUS_MS1_purchase |
| 1922 | 3845613 | 2021-01-02 | 2021-01-02 | 0.726463 | RUS_MS1_purchase |
| 2920 | 3843913 | 2021-01-01 | 2021-01-02 | 0.726463 | RUS_MS1_purchase |
| 4335 | 3908313 | 2021-01-22 | 2021-01-03 | 4.237699 | UKR_MS1_purchase |
| ... | ... | ... | ... | ... | ... |
| 290371 | 3864062 | 2021-01-07 | 2021-12-17 | 0.619811 | RUS_MS1_purchase |
| 295786 | 3908063 | 2021-01-22 | 2021-12-24 | 23.013104 | UKR_MS1_purchase |
| 296561 | 3908313 | 2021-01-22 | 2021-12-25 | 36.344492 | UKR_MS1_purchase |
| 298659 | 3908063 | 2021-01-22 | 2021-12-26 | 23.013104 | UKR_MS1_purchase |
| 303332 | 3861811 | 2021-01-07 | 2021-12-31 | 1.754931 | RUS_MS1_purchase |
325 rows × 5 columns
In [17]:
campaigns = list(table['campaign'].unique())
accounts = list(table['account_id'].unique())
payments = list(table['payment_date'].unique())
In [18]:
result_table = table.groupby(by=["campaign","account_id","created_date","payment_date"], as_index=False).sum()
In [19]:
dates = [7,14,30]
dates = [i for i in range(1, 32)]
ARPUs = np.zeros((len(campaigns), len(dates)))
In [20]:
for i, c in enumerate(campaigns):
cond1 = result_table['campaign'] == c
num_accounts = len(result_table[cond1])
for acc in accounts:
cond2 = result_table['account_id'] == acc
cond = np.logical_and(cond1, cond2)
tmp_table = result_table[cond]
s = np.zeros(len(dates))
for j,d in enumerate(dates):
cond = tmp_table['payment_date'] <= tmp_table['created_date'] + timedelta(days=d)
s[j] += tmp_table[cond]['cost'].sum()
# print(s)
ARPUs[i] += s
ARPUs[i] /= num_accounts
# print(ARPUs)
In [31]:
df_ARPU = pd.DataFrame(ARPUs.T, columns=campaigns)
# df_ARPU
df_ARPU['days'] = list(map(lambda x: f"{x} days" if x > 1 else f"{x} day", dates))
df_ARPU_melted = df_ARPU.melt('days', var_name='Campaign type', value_name='ARPU')
df_ARPU.iloc[[6, 13, 29]].to_csv('tables/arpu.csv')
df_ARPU.iloc[[6, 13, 29]].to_markdown('tables/arpu.md')
In [22]:
fig, ax = plt.subplots(1, 1, figsize=(8,6), dpi=160)
ax.grid()
# ax.set_xlim(1, 70)
ax.set(xlabel='Days', ylabel='Average revenue per user')
ax.set(title='ARPU, \nсгруппированный по рекламным компаниям')
# for i, c in enumerate(campaigns):
plt.xticks(rotation=70)
sns.lineplot(ax=ax, data=df_ARPU_melted, x='days', y='ARPU', hue='Campaign type');
plt.savefig("pics/arpu.png")
Фактический CPI (cost per install)¶
$$\Large CPI = \frac{\text{Количество установок}}{\text{Бюджет}} $$
In [23]:
d = {
("Media_source_1", "install", "Brazil"): "BRA_MS1_install",
("Media_source_1", "purchase", "Brazil"): "BRA_MS1_purchase",
("Media_source_1", "install", "Russia"): "RUS_MS1_install",
("Media_source_1", "purchase", "Russia"): "RUS_MS1_purchase",
("Media_source_1", "install", "Ukrane"): "UKR_MS1_install",
("Media_source_1", "purchase", "Ukrane"): "UKR_MS1_purchase",
("Media_source_2", "install", "Brazil"): "BRA_MS2_install",
("Media_source_2", "install", "English_speaking"): "ENG_MS2_install",
}
In [24]:
CPI = list()
tmp_budget = budget.groupby(by=["media_source", "Campaign_type",'Target']).sum()
for (MS, Type, Target), c in d.items():
cond = registrations['campaign'] == c
num_installs = cond.sum()
b = tmp_budget.loc[MS, Type, Target]['Spend, USD']
CPI.append((MS, Type, Target, b / num_installs))
In [32]:
df = pd.DataFrame(CPI, columns=["Media Source", 'Campaign type', 'Target', "CPI"])
df = df.set_index(["Media Source", 'Campaign type', 'Target'])
df = df.sort_values(by='CPI')
df.to_csv('tables/cpi.sorted.csv')
df.to_markdown('tables/cpi.sorted.md')
In [26]:
fig, ax = plt.subplots(1, 1, figsize=(8,6), dpi=160)
ax.grid()
ax.set(title='Фактический CPI')
sns.barplot(ax=ax, data=df, x='CPI', hue=df.index)
plt.legend(title="");
plt.savefig("pics/cpi.sorted.png")
In [27]:
compare_budget = budget.dropna().rename(
{
'media_source':"Media Source",
'Campaign_type':'Campaign type',
}, axis=1).set_index(["Media Source", 'Campaign type', 'Target'])[['Installs','Spend, USD']]
compare_df = df.join(compare_budget)
compare_df['Plan CPI'] = compare_df['Spend, USD'] / compare_df['Installs']
compare_df['Deviation'] = abs(compare_df['CPI'] - compare_df['Plan CPI'])
compare_df
Out[27]:
| CPI | Installs | Spend, USD | Plan CPI | Deviation | |||
|---|---|---|---|---|---|---|---|
| Media Source | Campaign type | Target | |||||
| Media_source_1 | install | Brazil | 0.008648 | 10460 | 99.12 | 0.009476 | 0.000828 |
| Ukrane | 0.014644 | 6424 | 99.43 | 0.015478 | 0.000834 | ||
| Russia | 0.019472 | 11894 | 246.30 | 0.020708 | 0.001236 | ||
| Media_source_2 | install | English_speaking | 0.245577 | 371 | 99.95 | 0.269407 | 0.023830 |
| Media_source_1 | purchase | Brazil | 0.350018 | 1081 | 398.67 | 0.368797 | 0.018780 |
| Russia | 0.390114 | 1441 | 616.77 | 0.428015 | 0.037901 | ||
| Media_source_2 | install | Brazil | 0.533882 | 152 | 81.15 | 0.533882 | 0.000000 |
| Media_source_1 | purchase | Ukrane | 1.338962 | 242 | 348.13 | 1.438554 | 0.099592 |
In [28]:
fig, ax = plt.subplots(1, 1, figsize=(8,6), dpi=160)
ax.grid()
ax.set(title='Отклонения фактического CPI от бюджета')
sns.barplot(ax=ax, data=compare_df, y='Deviation', hue=df.index)
plt.legend(title="");
plt.savefig("pics/cpi.deviation.png")