# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/databbva2023/digital.csv /kaggle/input/databbva2023/Diccionario de datos.xlsx /kaggle/input/databbva2023/archive/sample_submission.csv /kaggle/input/databbva2023/archive/balances.csv /kaggle/input/databbva2023/archive/universe_train.csv /kaggle/input/databbva2023/archive/movements.csv /kaggle/input/databbva2023/archive/liabilities.csv /kaggle/input/databbva2023/archive/universe_test.csv /kaggle/input/databbva2023/archive/customers.csv
Crear un dataset con las 5 fuentes. 5 variables por fuente, si es individual. 10 variables por fuente, si es de 2. Crear las variables usando DuckDB en SQL en Kaggle, no olvidar vincular su numero de telefono a su cuenta kaggle
%%time
path0 = '/kaggle/input/databbva2023/archive/'
train = pd.read_csv(f'{path0}universe_train.csv')
test = pd.read_csv(f'{path0}universe_test.csv')
test['attrition'] = -1
universe = pd.concat([train , test[train.columns]])#[['ID', 'period']]
print(universe.period.value_counts(1).sort_index())
print(train.shape, test.shape, universe.shape)
#universe = universe.sample(frac = .2, random_state = 123)
print(train.shape, test.shape, universe.shape)
universe_path = 'universe.csv' #tabla
universe.to_csv( universe_path , sep =',', index = False)
universe.head()
period period_1 0.106263 period_2 0.107940 period_3 0.109931 period_4 0.112305 period_5 0.113070 period_6 0.112493 period_7 0.111557 period_8 0.111922 period_9 0.114519 Name: proportion, dtype: float64 (535943, 3) (273636, 3) (809579, 3) (535943, 3) (273636, 3) (809579, 3) CPU times: user 6.28 s, sys: 326 ms, total: 6.6 s Wall time: 6.64 s
ID | attrition | period | |
---|---|---|---|
0 | 4d5a33701cb5b30b0b0a924d80de4ae78fbd0b54e2117f... | 0 | period_5 |
1 | bf7092e5f394d266143dfe90b3fc73eac51f0b0084d7a3... | 0 | period_5 |
2 | 270d8f8f607d19886c50edb7746c3670194134b56d31e2... | 0 | period_5 |
3 | 5e6b7bea5e4911329669f45728d3398ad54dfe11fbb16d... | 0 | period_5 |
4 | 98954adf775b9fce1c9e311a025ec3e0a1c6e90f991ef7... | 0 | period_5 |
! pip install duckdb=='0.9.2'
import duckdb
Collecting duckdb==0.9.2 Obtaining dependency information for duckdb==0.9.2 from https://files.pythonhosted.org/packages/7c/d5/7688cdc1b57c9ada17b6f6143a923606294b462e4b15e3ac249c04649bde/duckdb-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata Downloading duckdb-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (760 bytes) Downloading duckdb-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.5 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.5/16.5 MB 43.8 MB/s eta 0:00:00:00:0100:01 Installing collected packages: duckdb Successfully installed duckdb-0.9.2
#tablas
digital = '/kaggle/input/databbva2023/digital.csv'
sample_submission = '/kaggle/input/databbva2023/archive/sample_submission.csv'
balances='/kaggle/input/databbva2023/archive/balances.csv'
universe_train = '/kaggle/input/databbva2023/archive/universe_train.csv'
movements = '/kaggle/input/databbva2023/archive/movements.csv'
liabilities = '/kaggle/input/databbva2023/archive/liabilities.csv'
universe_test = '/kaggle/input/databbva2023/archive/universe_test.csv'
customers ='/kaggle/input/databbva2023/archive/customers.csv'
universe_path = 'universe.csv'
!head {universe_path} -n 10
ID,attrition,period 4d5a33701cb5b30b0b0a924d80de4ae78fbd0b54e2117fc9fa8a18b6c03a7463,0,period_5 bf7092e5f394d266143dfe90b3fc73eac51f0b0084d7a350a66bc5886a3a2735,0,period_5 270d8f8f607d19886c50edb7746c3670194134b56d31e2389b95baa046e8e81a,0,period_5 5e6b7bea5e4911329669f45728d3398ad54dfe11fbb16d1f6c77968679cfa4a9,0,period_5 98954adf775b9fce1c9e311a025ec3e0a1c6e90f991ef7f26130b02106f2326c,0,period_5 56fe8d95f1af9103e92ce450ece54b4647ccb001a866e61fb1754a8be9b66f8b,1,period_5 f728012da429bb829188103c2cb56ef496a7b7e27360fd992aa3a3686735b2e9,0,period_5 9bcc48234dd33f97a9e5d475d39fcb4d562bd7489404d1aa3521b0178fc44ef6,0,period_5 d8e3721b57230439133fd302f51312c6872eec574877a9b83b026059694cc44b,0,period_5
%%time
#
# 'movements_sum_type_4_9'
#movements_std_type_1_12
tabla = movements
historia = 9
var1 = 'type_4'
alias = 'movements'
oper = 'sum' #'stddev_pop'
periodo = 'period_9'
query2 = f""" with
PPP as (select ID, period from "{universe_path}"
where period = '{periodo}')
select * from PPP
"""
print(query2)
df1 = duckdb.query(query2).to_df()
df1
#print(df1.head())
#df1.describe()
with PPP as (select ID, period from "universe.csv" where period = 'period_9') select * from PPP CPU times: user 1.7 s, sys: 124 ms, total: 1.83 s Wall time: 688 ms
ID | period | |
---|---|---|
0 | 20b427c65c62ee0177adfa300bbf9f93ba2e46c86d5bc7... | period_9 |
1 | 4881d4a2146002a02279bedb6f0a844af022b4dd1d8b1b... | period_9 |
2 | 28ff12351f215091b54166a09ca44858f3d7f6245f281c... | period_9 |
3 | 5cd2c0dc92c1c5ffa83a94f15756cef5c58174e9d8c69a... | period_9 |
4 | ceebdd2bbaafd83d8c205e8071b79da5338e9db94c9509... | period_9 |
... | ... | ... |
92707 | d6d86ea71a4676f331f9f23db1dde8ec33ea5c6ddf18b5... | period_9 |
92708 | f5ee95d9d6622b80acc0992b38f1fdba8dd8e9c727b1fb... | period_9 |
92709 | e1db2bad3009155d1abc5f7643cff1e97266a4149655c2... | period_9 |
92710 | ea0f8cd50187e128c88dacc20ea47938c44160f42f54b3... | period_9 |
92711 | 692519305cc796db455f26fc93368a4f802bae16fbbcef... | period_9 |
92712 rows × 2 columns
%%time
#
# 'movements_sum_type_4_9'
#movements_std_type_1_12
tabla = movements
historia = 9
var1 = 'type_4'
alias = 'movements'
periodo = 'period_1'
oper = 'sum' #'stddev_pop'
query2 = f""" with tabla_h1 as (
select ID, period,
{oper}( case when month>=12-{historia} + 1 then {var1} end ) as {alias}_{oper}_{var1}_{historia}m
from '{tabla}'
where period = '{periodo}'
group by 1, 2 ),
tabla_lag as ( select A.ID,A.month,A.period, A.type_4, B.type_4 as type_4_lag,
A.type_4 - B.type_4 as dif_type_4, A.type_4/B.type_4 as rat_type_4
from '{tabla}' A
left join '{tabla}' B
on A.month = B.month + 1
and A.ID == B.ID
where A.period = '{periodo}')
select * from tabla_lag
where ID like '%aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6%'
order by month
"""
print(query2)
df1 = duckdb.query(query2).to_df()
df1
#df1.describe()
with tabla_h1 as ( select ID, period, sum( case when month>=12-9 + 1 then type_4 end ) as movements_sum_type_4_9m from '/kaggle/input/databbva2023/archive/movements.csv' where period = 'period_1' group by 1, 2 ), tabla_lag as ( select A.ID,A.month,A.period, A.type_4, B.type_4 as type_4_lag, A.type_4 - B.type_4 as dif_type_4, A.type_4/B.type_4 as rat_type_4 from '/kaggle/input/databbva2023/archive/movements.csv' A left join '/kaggle/input/databbva2023/archive/movements.csv' B on A.month = B.month + 1 and A.ID == B.ID where A.period = 'period_1') select * from tabla_lag where ID like '%aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6%' order by month
CPU times: user 22.4 s, sys: 1.22 s, total: 23.6 s Wall time: 6.86 s
ID | month | period | type_4 | type_4_lag | dif_type_4 | rat_type_4 | |
---|---|---|---|---|---|---|---|
0 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 1 | period_1 | 3.2422 | NaN | NaN | NaN |
1 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 2 | period_1 | 3.6018 | 3.2422 | 0.3596 | 1.110912 |
2 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 3 | period_1 | 3.2380 | 3.6018 | -0.3638 | 0.898995 |
3 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 4 | period_1 | 0.0000 | 3.2380 | -3.2380 | 0.000000 |
4 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 5 | period_1 | 0.0000 | 0.0000 | 0.0000 | NaN |
5 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 6 | period_1 | 2.9748 | 0.0000 | 2.9748 | NaN |
6 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 7 | period_1 | 2.3521 | 2.9748 | -0.6227 | 0.790675 |
7 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 8 | period_1 | 3.6789 | 2.3521 | 1.3268 | 1.564092 |
8 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 9 | period_1 | 3.3151 | 3.6789 | -0.3638 | 0.901112 |
9 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 10 | period_1 | 0.0000 | 3.3151 | -3.3151 | 0.000000 |
10 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 11 | period_1 | 3.4372 | 0.0000 | 3.4372 | NaN |
11 | aa1e422b92b28313b2cb5705345305c1dec0bb10a65b6b... | 12 | period_1 | 0.0000 | 3.4372 | -3.4372 | 0.000000 |
%%time
#
# 'movements_sum_type_4_9'
#movements_std_type_1_12
tabla = movements
historia = 9
var1 = 'type_4'
alias = 'movements'
periodo = 'period_1'
oper = 'sum' #'stddev_pop'
query2 = f""" with
PO as (select ID, period from '{universe_path}'
where period = '{periodo}'),
tabla_lag as ( select A.ID,A.month,A.period, A.type_4, B.type_4 as type_4_lag,
A.type_4 - B.type_4 as dif_type_4, A.type_4/B.type_4 as rat_type_4
from '{tabla}' A
left join '{tabla}' B
on A.month = B.month + 1
and A.ID == B.ID
where A.period = '{periodo}'),
tabla_h1 as (
select ID, period,
{oper}( case when month>=12-{historia} + 1 then type_4 end ) as {alias}_{oper}_type_4_{historia}m,
{oper}( case when month>=12-{historia} + 1 then dif_type_4 end ) as {alias}_{oper}_dif_type_4_{historia}m,
{oper}( case when month>=12-{historia} + 1 then rat_type_4 end ) as {alias}_{oper}_rat_type_4_{historia}m
from tabla_lag
group by 1, 2
)
select PO.ID, PO.period,
tabla_h1.{alias}_{oper}_{var1}_{historia}m ,
tabla_h1.{alias}_{oper}_dif_type_4_{historia}m,
tabla_h1.{alias}_{oper}_rat_type_4_{historia}m
from PO left join tabla_h1
on PO.ID = tabla_h1.ID
and PO.period = tabla_h1.period
"""
print(query2)
df1 = duckdb.query(query2).to_df()
print(df1.describe())
df1
with PO as (select ID, period from 'universe.csv' where period = 'period_1'), tabla_lag as ( select A.ID,A.month,A.period, A.type_4, B.type_4 as type_4_lag, A.type_4 - B.type_4 as dif_type_4, A.type_4/B.type_4 as rat_type_4 from '/kaggle/input/databbva2023/archive/movements.csv' A left join '/kaggle/input/databbva2023/archive/movements.csv' B on A.month = B.month + 1 and A.ID == B.ID where A.period = 'period_1'), tabla_h1 as ( select ID, period, sum( case when month>=12-9 + 1 then type_4 end ) as movements_sum_type_4_9m, sum( case when month>=12-9 + 1 then dif_type_4 end ) as movements_sum_dif_type_4_9m, sum( case when month>=12-9 + 1 then rat_type_4 end ) as movements_sum_rat_type_4_9m from tabla_lag group by 1, 2 ) select PO.ID, PO.period, tabla_h1.movements_sum_type_4_9m , tabla_h1.movements_sum_dif_type_4_9m, tabla_h1.movements_sum_rat_type_4_9m from PO left join tabla_h1 on PO.ID = tabla_h1.ID and PO.period = tabla_h1.period
movements_sum_type_4_9m movements_sum_dif_type_4_9m \ count 53812.000000 43327.000000 mean 8.689610 0.093409 std 11.992447 1.900045 min 0.000000 -13.343400 25% 0.000000 0.000000 50% 3.077300 0.000000 75% 13.525000 0.265150 max 45.000000 13.889900 movements_sum_rat_type_4_9m count 26155.000000 mean 3.601818 std 3.517176 min 0.000000 25% 0.470228 50% 2.436791 75% 6.737247 max 139.179389 CPU times: user 24.4 s, sys: 1.15 s, total: 25.5 s Wall time: 8.05 s
ID | period | movements_sum_type_4_9m | movements_sum_dif_type_4_9m | movements_sum_rat_type_4_9m | |
---|---|---|---|---|---|
0 | 8d52c4d1c816105c94baafce541821d7ae2ce4f8345ace... | period_1 | 10.0068 | 0.0000 | 0.691783 |
1 | c7758875382acc4047bf25b26c23bd6bfc5f4999bc4133... | period_1 | 4.6371 | -2.9060 | 0.000000 |
2 | c6c09ad76b795df083f6a6a1b683b5da90698335a04863... | period_1 | 22.3868 | -0.3029 | 2.548814 |
3 | 2042cb241b66cb000d7674f853de8c7ff26135d016fc95... | period_1 | 13.4925 | -0.4203 | 2.070439 |
4 | 343937c457f4e6935b4711ea386963939670afc8df4a05... | period_1 | 0.0000 | 0.0000 | NaN |
... | ... | ... | ... | ... | ... |
86023 | d1a13d63315cb0e70fd7d57cf8a70c9b950f9e93827ba5... | period_1 | NaN | NaN | NaN |
86024 | a25a6b000c95a427c785ec38cc4df08d0203195956c6bf... | period_1 | NaN | NaN | NaN |
86025 | 56ea360572406776e0a2c09dd45f423e60758b1229acd6... | period_1 | NaN | NaN | NaN |
86026 | 8a1f96cf0c387b1afffe93c79d81b1f9deeffd73dce23c... | period_1 | NaN | NaN | NaN |
86027 | 9e31f6bbee66f89acf0b2fb8d737276cb5c50705c9ca38... | period_1 | NaN | NaN | NaN |
86028 rows × 5 columns
%%time
#
# 'movements_sum_type_4_9'
#movements_std_type_1_12
tabla = balances
historia = 9
var1 = 'type_4'
alias = 'movements'
oper = 'sum' #'stddev_pop'
periodo = 'period_1'
query2 = f""" with
PO as (select ID, period from '{universe_path}'
where period = '{periodo}'),
balance1 as ( select ID, period, month,entity,
case
when ( type='type_3' and product = 'product_1') then 't3p1'
when ( type='type_3' and product = 'product_2') then 't3p2'
when ( type='type_5' and product = 'product_6') then 't5p6'
when ( type='type_7' and product = 'product_2') then 't7p2'
when ( type='type_7' and product = 'product_4') then 't7p4'
when ( type='type_4' and product = 'product_2') then 't4p2'
when ( type='type_4' and product = 'product_4') then 't4p4'
else 't12347_p24357'
end as type_prod,
balance_amount, days_default
from '{tabla}'
where period = '{periodo}'
)
select balance1.*
from PO inner join balance1
on PO.ID = balance1.ID
and PO.period = balance1.period
limit 10
"""
print(query2)
df1 = duckdb.query(query2).to_df()
df1.head()
with PO as (select ID, period from 'universe.csv' where period = 'period_1'), balance1 as ( select ID, period, month,entity, case when ( type='type_3' and product = 'product_1') then 't3p1' when ( type='type_3' and product = 'product_2') then 't3p2' when ( type='type_5' and product = 'product_6') then 't5p6' when ( type='type_7' and product = 'product_2') then 't7p2' when ( type='type_7' and product = 'product_4') then 't7p4' when ( type='type_4' and product = 'product_2') then 't4p2' when ( type='type_4' and product = 'product_4') then 't4p4' else 't12347_p24357' end as type_prod, balance_amount, days_default from '/kaggle/input/databbva2023/archive/balances.csv' where period = 'period_1' ) select balance1.* from PO inner join balance1 on PO.ID = balance1.ID and PO.period = balance1.period limit 10 CPU times: user 5.38 s, sys: 382 ms, total: 5.76 s Wall time: 1.97 s
ID | period | month | entity | type_prod | balance_amount | days_default | |
---|---|---|---|---|---|---|---|
0 | 4b7b103203c6c137a485d70620395f8fbff0794cef72a4... | period_1 | 8 | entity_1 | t3p2 | 4.3867 | 0.0 |
1 | d478e7dd268be87ab1558b61922fa7cc444b2f701c2020... | period_1 | 10 | entity_3 | t3p2 | 5.0000 | 0.0 |
2 | 39ea65e8f41c576650aa2c505c7ad6c3d11e2c5d7d0291... | period_1 | 4 | entity_5 | t3p2 | 4.5857 | 0.0 |
3 | d9dfb84ce1d0cb0f4b0661deaad48a4b3640170eed550b... | period_1 | 3 | entity_3 | t3p2 | 4.0060 | 0.0 |
4 | cf1753c60704f6000caca60845e98425805a23bca0115a... | period_1 | 1 | entity_5 | t3p2 | 4.8710 | 0.0 |
%%time
#
# 'movements_sum_type_4_9'
#movements_std_type_1_12
tabla = balances
historia = 9
var1 = 'type_4'
alias = 'movements'
oper = 'sum' #'stddev_pop'
periodo = 'period_1'
query2 = f""" with
PO as (select ID, period from '{universe_path}'
where period = '{periodo}'),
balance1 as ( select ID, period, month,
case
when ( type='type_3' and product = 'product_1') then 't3p1'
when ( type='type_3' and product = 'product_2') then 't3p2'
when ( type='type_5' and product = 'product_6') then 't5p6'
when ( type='type_7' and product = 'product_2') then 't7p2'
when ( type='type_7' and product = 'product_4') then 't7p4'
when ( type='type_4' and product = 'product_2') then 't4p2'
when ( type='type_4' and product = 'product_4') then 't4p4'
else 't12347_p24357'
end as type_prod,
sum( balance_amount) as type_prod_balance_amount , max(days_default) as type_prod_days_default
from '{tabla}'
where period = '{periodo}'
group by 1, 2,3, 4 ),
balance2 as ( select ID, period, month, entity,
sum( balance_amount) as entity_balance_amount , max(days_default) as entity_days_default
from '{tabla}'
where period = '{periodo}'
group by 1, 2,3, 4 ),
balance_tpe as (select balance1.* , balance2.entity, balance2.entity_balance_amount, balance2.entity_days_default
from balance1
inner join balance2
on balance1.ID = balance2.ID
and balance1.period = balance2.period
and balance1.month = balance2.month)
select balance_tpe.*
from PO inner join balance_tpe
on PO.ID = balance_tpe.ID
and PO.period = balance_tpe.period
limit 10
"""
print(query2)
df1 = duckdb.query(query2).to_df()
df1.head()
with PO as (select ID, period from 'universe.csv' where period = 'period_1'), balance1 as ( select ID, period, month, case when ( type='type_3' and product = 'product_1') then 't3p1' when ( type='type_3' and product = 'product_2') then 't3p2' when ( type='type_5' and product = 'product_6') then 't5p6' when ( type='type_7' and product = 'product_2') then 't7p2' when ( type='type_7' and product = 'product_4') then 't7p4' when ( type='type_4' and product = 'product_2') then 't4p2' when ( type='type_4' and product = 'product_4') then 't4p4' else 't12347_p24357' end as type_prod, sum( balance_amount) as type_prod_balance_amount , max(days_default) as type_prod_days_default from '/kaggle/input/databbva2023/archive/balances.csv' where period = 'period_1' group by 1, 2,3, 4 ), balance2 as ( select ID, period, month, entity, sum( balance_amount) as entity_balance_amount , max(days_default) as entity_days_default from '/kaggle/input/databbva2023/archive/balances.csv' where period = 'period_1' group by 1, 2,3, 4 ), balance_tpe as (select balance1.* , balance2.entity, balance2.entity_balance_amount, balance2.entity_days_default from balance1 inner join balance2 on balance1.ID = balance2.ID and balance1.period = balance2.period and balance1.month = balance2.month) select balance_tpe.* from PO inner join balance_tpe on PO.ID = balance_tpe.ID and PO.period = balance_tpe.period limit 10
CPU times: user 1min 19s, sys: 2.78 s, total: 1min 22s Wall time: 22.5 s
ID | period | month | type_prod | type_prod_balance_amount | type_prod_days_default | entity | entity_balance_amount | entity_days_default | |
---|---|---|---|---|---|---|---|---|---|
0 | 1067c9690e7ebb5dcd0903bc5016c4eb0653409596b0fb... | period_1 | 2 | t12347_p24357 | 18.6978 | 0.0 | entity_3 | 26.3277 | 0.0 |
1 | b646894c39b1de25720cf5165f5761a25b53ef61bc1c94... | period_1 | 10 | t5p6 | 5.0000 | 0.0 | entity_1 | 5.0000 | 0.0 |
2 | 47d93d6c5747d23feedea5123aa5c80bed6edf8e227aac... | period_1 | 3 | t3p1 | 3.9822 | 0.0 | entity_1 | 3.3001 | 0.0 |
3 | 3350f41e644768a29abe0d9460f3a2dfb5e4c619cdcb8a... | period_1 | 7 | t3p1 | 10.5874 | 0.0 | entity_5 | 3.0631 | 0.0 |
4 | 08899d59087fab1b84cfee7c65d3b43a041b45d2474890... | period_1 | 3 | t3p1 | 4.1607 | 0.0 | entity_4 | 4.0559 | 0.0 |
!free -h
total used free shared buff/cache available Mem: 31Gi 1.7Gi 21Gi 0.0Ki 7.7Gi 29Gi Swap: 0B 0B 0B
%%time
#
# 'movements_sum_type_4_9'
#movements_std_type_1_12
tabla = movements
historia = 9
var1 = 'type_4'
alias = 'movements'
oper = 'sum' #'stddev_pop'
periodo = 'period_6'
query2 = f""" with
PO as (select ID, period from '{universe_path}'
where period = '{periodo}'),
tabla_h1 as (
select ID, period,
{oper}( case when month>=12-{historia} + 1 then {var1} end ) as {alias}_{oper}_{var1}_{historia}m
from '{tabla}'
where period = '{periodo}'
group by 1, 2 )
select PO.ID, PO.period,
tabla_h1.{alias}_{oper}_{var1}_{historia}m
from PO inner join tabla_h1
on PO.ID = tabla_h1.ID
and PO.period = tabla_h1.period
"""
print(query2)
df1 = duckdb.query(query2).to_df()
print(df1.head())
df1.describe()
with PO as (select ID, period from 'universe.csv' where period = 'period_6'), tabla_h1 as ( select ID, period, sum( case when month>=12-9 + 1 then type_4 end ) as movements_sum_type_4_9m from '/kaggle/input/databbva2023/archive/movements.csv' where period = 'period_6' group by 1, 2 ) select PO.ID, PO.period, tabla_h1.movements_sum_type_4_9m from PO inner join tabla_h1 on PO.ID = tabla_h1.ID and PO.period = tabla_h1.period
ID period \ 0 81310fcba917eb8ebb02d7c3cff0a6c46640199677ea75... period_6 1 46e0a2f68ae706098ff27d01b8c458d572ee1a679f79b6... period_6 2 099314cecc24929a92db299c11737105c1fd972f1fbbb9... period_6 3 85c9fdb77dcc2c8961fe992f3057fd428b3e0cb5134294... period_6 4 d1d8bda8ac618c329ba3d40f8ee0c7144085aed4b568be... period_6 movements_sum_type_4_9m 0 2.4029 1 2.1603 2 2.1233 3 3.0739 4 0.0000 CPU times: user 12.5 s, sys: 528 ms, total: 13 s Wall time: 4.31 s
movements_sum_type_4_9m | |
---|---|
count | 57754.000000 |
mean | 8.516584 |
std | 11.855139 |
min | 0.000000 |
25% | 0.000000 |
50% | 2.964400 |
75% | 13.077250 |
max | 45.000000 |