fork download
  1. import numpy as np
  2. import pandas as pd
  3. from sklearn.preprocessing import StandardScaler, OneHotEncoder
  4. from sklearn.cluster import KMeans
  5. from sklearn.compose import ColumnTransformer
  6. from sklearn.pipeline import Pipeline
  7. import matplotlib.pyplot as plt
  8.  
  9. # Generate sample data
  10. np.random.seed(42)
  11.  
  12. def generate_sample_data(n_samples=1000):
  13. data = {
  14. 'customer_id': range(1, n_samples + 1),
  15. 'recency': np.random.randint(1, 365, n_samples),
  16. 'frequency': np.random.randint(1, 20, n_samples),
  17. 'monetary_value': np.random.randint(50, 2000, n_samples),
  18. 'favorite_category': np.random.choice(['shoes', 'underwear', 'accessories', 'shirts', 'blouses', 'pants', 'jackets', 'sweaters', 'jewelry'], n_samples),
  19. 'items_purchased': np.random.randint(1, 50, n_samples)
  20. }
  21. return pd.DataFrame(data)
  22.  
  23. # Create sample dataset
  24. df = generate_sample_data()
  25.  
  26. # Preprocessing
  27. numeric_features = ['recency', 'frequency', 'monetary_value', 'items_purchased']
  28. categorical_features = ['favorite_category']
  29.  
  30. preprocessor = ColumnTransformer(
  31. transformers=[
  32. ('num', StandardScaler(), numeric_features),
  33. ('cat', OneHotEncoder(drop='first', sparse=False), categorical_features)
  34. ])
  35.  
  36. # Create pipeline
  37. pipeline = Pipeline([
  38. ('preprocessor', preprocessor),
  39. ('kmeans', KMeans(n_clusters=3, random_state=42))
  40. ])
  41.  
  42. # Fit the pipeline
  43. pipeline.fit(df)
  44.  
  45. # Get cluster labels
  46. df['cluster'] = pipeline.predict(df)
  47.  
  48. # Compute insights about the three customer groups
  49. def compute_insights(df):
  50. insights = df.groupby('cluster').agg({
  51. 'recency': 'mean',
  52. 'frequency': 'mean',
  53. 'monetary_value': 'mean',
  54. 'items_purchased': 'mean',
  55. 'favorite_category': lambda x: x.value_counts().index[0]
  56. }).reset_index()
  57.  
  58. insights.columns = ['Cluster', 'Avg Recency (days)', 'Avg Frequency (quarter)', 'Avg Monetary Value ($)', 'Avg Items Purchased', 'Most Common Category']
  59. return insights
  60.  
  61. insights = compute_insights(df)
  62. print(insights)
  63.  
  64. # Visualize clusters
  65. plt.figure(figsize=(10, 6))
  66. scatter = plt.scatter(df['recency'], df['monetary_value'], c=df['cluster'], cmap='viridis')
  67. plt.colorbar(scatter)
  68. plt.xlabel('Recency (days)')
  69. plt.ylabel('Monetary Value ($)')
  70. plt.title('Customer Segments: Recency vs Monetary Value')
  71. plt.show()
  72.  
  73. # Function to classify new customers
  74. def classify_customer(customer_data, pipeline):
  75. customer_df = pd.DataFrame([customer_data])
  76. cluster = pipeline.predict(customer_df)[0]
  77. return f"Customer belongs to Cluster {cluster}"
  78.  
  79. # Example usage
  80. new_customer = {
  81. 'recency': 30,
  82. 'frequency': 5,
  83. 'monetary_value': 500,
  84. 'favorite_category': 'shoes',
  85. 'items_purchased': 10
  86. }
  87.  
  88. print(classify_customer(new_customer, pipeline))
Success #stdin #stdout 2.9s 135012KB
stdin
Standard input is empty
stdout
   Cluster  Avg Recency (days)  ...  Avg Items Purchased  Most Common Category
0        0          204.763473  ...            24.308383              sweaters
1        1          167.557692  ...            24.714286           accessories
2        2          172.158940  ...            26.615894             underwear

[3 rows x 6 columns]
Customer belongs to Cluster 2