#Cell 1: Import packages import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report import xgboost as xgb ==================================================================== #Cell 2: Load and transform data #Step 1: All necessary data #Load raw revenue tabe revenue_spark_df = spark.table("finmgmtcourse.finmgmtdata.revenue") invoices = revenue_spark_df.toPandas() queried_spark_df = spark.table("finmgmtcourse.finmgmtdata.queried_invoices") queries = queried_spark_df.toPandas() delphi_spark_df = spark.table("finmgmtcourse.finmgmtdata.delphi_scores") delphi = delphi_spark_df.toPandas() invoices = pd.merge(invoices, delphi, left_on='Client', right_on='Client', how='left') # Mark queried invoices queries["flag"] = 1 # Join queried labels onto invoices data = pd.merge(invoices, queries, left_on='Invoice Number', right_on='Queried', how='left') # Missing values in queried → 0 data["flag"] = data["flag"].fillna(0) display(data) ==================================================================== #Cell 3: One hot encode data = pd.get_dummies(data, columns=['Product'], drop_first=True) ==================================================================== #Cell 4: Prepare data # Identify latest month data["Month"] = pd.to_datetime(data["Month"], format="%m/%d/%Y") data["year_month"] = data["Month"].dt.to_period("M") latest_month = data["year_month"].max() print("Latest month identified:", latest_month) # Split into: # - training set = all prior months # - scoring set = latest month only train_data = data[data["year_month"] < latest_month].copy() score_data = data[data["year_month"] == latest_month].copy() # Features and target feature_cols = ["Price", "Revenue","Delphi","Product_Prd-CHIP-H-0002", 'Product_Prd-CHIP-H-0003', 'Product_Prd-CHIP-S-0004', 'Product_Prd-CHIP-S-0005', 'Product_Prd-CHIP-S-0006', 'Product_Prd-CHIP-S-0007', 'Product_Prd-CHIP-S-0008', 'Product_Prd-LICS-0009', 'Product_Prd-LICS-0010'] # replace with your own X_train = train_data[feature_cols] y_train = train_data["flag"] X_score = score_data[feature_cols] ==================================================================== # Cell 5: Fit models # Train/Test split (only for evaluation) X_tr, X_te, y_tr, y_te = train_test_split( X_train, y_train, test_size=0.2, random_state=42 ) # 1) Logistic Regression log_reg = LogisticRegression(max_iter=500, class_weight='balanced') log_reg.fit(X_tr, y_tr) print("\n=== Logistic Regression Evaluation ===") y_pred_lr = log_reg.predict(X_te) print(classification_report(y_te, y_pred_lr)) # 2) XGBoost Classifier # Handle class imbalance pos = sum(y_train) neg = len(y_train) - pos scale_pos_weight = neg / pos xgb_model = xgb.XGBClassifier( n_estimators=300, max_depth=5, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=scale_pos_weight, eval_metric='logloss' ) xgb_model.fit(X_tr, y_tr) print("\n=== XGBoost Evaluation ===") y_pred_xgb = xgb_model.predict(X_te) print(classification_report(y_te, y_pred_xgb)) ==================================================================== #Cell 5: Score probability # Predict probabilities for latest month invoices score_data["prob_xgb"] = xgb_model.predict_proba(X_score)[:, 1] # Save or inspect results score_output = score_data[["Invoice Number","Invoice Date", "prob_xgb"]] display(score_output)