-- CREDIT CARD ANALYSIS BY CLIENT PAYMENT PATTERNS AND FEATURE ENGINEERING
-- Columns in the dataset, column type, and NULL values
SELECT
column_name,
data_type,
is_nullable
FROM
`bigquery-public-data.ml_datasets.INFORMATION_SCHEMA.COLUMNS`
WHERE
table_name = 'credit_card_default';
-- Data distribution and standard deviation
SELECT
AVG(limit_balance) AS avg_limit_balance,
MIN(limit_balance) AS min_limit_balance,
MAX(limit_balance) AS max_limit_balance,
STDDEV(limit_balance) AS stddev_limit_balance,
AVG(age) AS avg_age,
MIN(age) AS min_age,
MAX(age) AS max_age,
STDDEV(age) AS stddev_age
FROM `bigquery-public-data.ml_datasets.credit_card_default`;
-- Check for missing values
SELECT
SUM(CASE WHEN limit_balance IS NULL THEN 1 ELSE 0 END) AS null_limit_balance,
SUM(CASE WHEN sex IS NULL THEN 1 ELSE 0 END) AS null_sex,
SUM(CASE WHEN education_level IS NULL THEN 1 ELSE 0 END) AS null_education_level,
SUM(CASE WHEN marital_status IS NULL THEN 1 ELSE 0 END) AS null_marital_status
FROM `bigquery-public-data.ml_datasets.credit_card_default`;
-- Calculate the mean average balance
SELECT AVG(limit_balance) AS avg_limit_balance
FROM `bigquery-public-data.ml_datasets.credit_card_default`;
-- Analyze top and IQR, outliers for limit_balance
SELECT
APPROX_QUANTILES(limit_balance, 4)[OFFSET(1)] AS Q1,
APPROX_QUANTILES(limit_balance, 4)[OFFSET(3)] AS Q3
FROM `bigquery-public-data.ml_datasets.credit_card_default`;
-- Calculate the 'upper' and 'lower' bounds for limit_balance
SELECT
Q1 - 1.5 * (Q3 - Q1) AS lower_bound,
Q3 + 1.5 * (Q3 - Q1) AS upper_bound
FROM (
SELECT
APPROX_QUANTILES(limit_balance, 4)[OFFSET(1)] AS Q1,
APPROX_QUANTILES(limit_balance, 4)[OFFSET(3)] AS Q3
FROM `bigquery-public-data.ml_datasets.credit_card_default`
);
-- Handle gender values
SELECT DISTINCT sex
FROM `bigquery-public-data.ml_datasets.credit_card_default`
WHERE sex NOT IN ('1', '2');
-- PAYMENT PATTERNS AND FEATURE ENGINEERING
-- Repayment patterns
SELECT pay_0, COUNT(*) AS count_sept
FROM `bigquery-public-data.ml_datasets.credit_card_default`
GROUP BY pay_0
ORDER BY pay_0;
-- Default payments
SELECT
default_payment_next_month,
AVG(limit_balance) AS avg_credit_limit,
AVG(bill_amt_1) AS avg_bill_sept
FROM `bigquery-public-data.ml_datasets.credit_card_default`
GROUP BY default_payment_next_month;
-- Monthly payment consistency
SELECT
id,
(bill_amt_1 - pay_amt_1) AS balance_sept,
(bill_amt_2 - pay_amt_2) AS balance_aug
FROM `bigquery-public-data.ml_datasets.credit_card_default`;
-- Average payment delay by client
SELECT
id,
AVG(
SAFE_CAST(pay_0 AS FLOAT64) +
SAFE_CAST(pay_2 AS FLOAT64) +
SAFE_CAST(pay_3 AS FLOAT64) +
SAFE_CAST(pay_4 AS FLOAT64) +
SAFE_CAST(pay_5 AS FLOAT64) +
SAFE_CAST(pay_6 AS FLOAT64)
) AS avg_payment_delay
FROM `bigquery-public-data.ml_datasets.credit_card_default`
GROUP BY id;
-- Current outstanding balance per client
SELECT
(bill_amt_1 + bill_amt_2 + bill_amt_3 + bill_amt_4 + bill_amt_5 + bill_amt_6) AS total_outstanding_balance
FROM `bigquery-public-data.ml_datasets.credit_card_default`
ORDER BY total_outstanding_balance DESC;
-- Credit default by education level and marital status
SELECT
CASE
WHEN education_level = '1' THEN 'Graduate School'
WHEN education_level = '2' THEN 'University'
WHEN education_level = '3' THEN 'High School'
WHEN education_level = '4' THEN 'Other'
WHEN education_level IN ('5', '6') THEN 'Unknown'
ELSE 'Unspecified'
END AS education_level_label,
CASE
WHEN marital_status = '1' THEN 'Married'
WHEN marital_status = '2' THEN 'Single'
WHEN marital_status = '3' THEN 'Other'
ELSE 'Unknown'
END AS marital_status_label,
AVG(CAST(default_payment_next_month AS FLOAT64)) AS default_rate
FROM `bigquery-public-data.ml_datasets.credit_card_default`
GROUP BY education_level_label, marital_status_label
ORDER BY default_rate DESC;
-- HIGH RISK Clients based on prediction score
SELECT
id,
predicted_default_payment_next_month.tables.score AS predicted_default_score
FROM `bigquery-public-data.ml_datasets.credit_card_default`
WHERE predicted_default_score > 0.8
ORDER BY predicted_default_score DESC;