#!/usr/bin/env python
# coding: utf-8
"""RFM user-clustering job.

Reads audited point records from MySQL, builds per-account RFM scores
(Recency / Frequency / Monetary), chooses the KMeans cluster count k by
combining the elbow-curve curvature with the silhouette score, then writes
each user's cluster label back to ``sys_user.persona`` and inserts a warning
message for users falling in the low-performing clusters.
"""
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sqlalchemy import bindparam, create_engine, text
from urllib.parse import quote_plus as urlquote
import datetime
import numpy as np
import pandas as pd
import time

print("==== start ====")

# NOTE(review): credentials and host are hard-coded in source; move them to
# environment variables or a config file.
db_password = 'POMS_129@MySQL'
db_engine = create_engine(
    f'mysql+pymysql://poms:{urlquote(db_password)}@101.133.164.208:3306/dev_poms?charset=utf8'
)

sql = ("SELECT `create_user` AS `account`, `point_uuid`, `point`, `create_time` "
       "FROM `sys_point_record` WHERE `audited` = 1")
raw_data = pd.read_sql(sql, db_engine)

# ---- RFM feature construction -----------------------------------------------
df2 = raw_data.copy().fillna(0)
now = pd.to_datetime(datetime.datetime.now())
# Recency in whole days per record; the per-account minimum is taken below.
df2["recent"] = df2["create_time"].map(lambda x: (now - pd.to_datetime(x)).days)

recent_df = df2.groupby(by="account", as_index=False).agg({"recent": "min"})
frequency_df = df2.groupby(by="account", as_index=False).agg({"point_uuid": "count"})
mount_df = df2.groupby(by="account", as_index=False).agg({"point": "sum"})
rfm_df = (recent_df
          .merge(frequency_df, on="account", how="left")
          .merge(mount_df, on="account", how="left"))

rfm_df2 = rfm_df.copy()

# R labels are reversed: smaller recency (more recent activity) scores higher.
recent_labels = [5, 4, 3, 2, 1]
mount_labels = [1, 2, 3, 4, 5]
frequency_bins = [1, 3, 5, 12]
frequency_labels = [1, 2, 3]
# Quintile edges for M and R. NOTE(review): pd.cut raises on duplicate bin
# edges (e.g. many identical point totals) — confirm the data guarantees
# distinct quintiles, or dedupe the edges before cutting.
m_bins = rfm_df2["point"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')
r_bins = rfm_df2["recent"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')
rfm_df2["R"] = pd.cut(rfm_df2["recent"], bins=r_bins, labels=recent_labels, include_lowest=True)
rfm_df2["F"] = pd.cut(rfm_df2["point_uuid"], bins=frequency_bins, labels=frequency_labels, include_lowest=True)
rfm_df2["M"] = pd.cut(rfm_df2["point"], bins=m_bins, labels=mount_labels, include_lowest=True)
rfm_df2 = rfm_df2.drop(['recent', 'point_uuid', 'point'], axis=1)
rfm_df2 = rfm_df2.fillna(3)  # values outside the bins get the neutral middle score
rfm_df2 = rfm_df2.astype(int)

rfm = rfm_df2[['R', 'F', 'M']]
# Min-max scale each score column to [0, 1]; a constant column divides by
# zero (NaN) and is reset to 0 by the fillna below.
normalization_rfm = rfm.apply(lambda col: (col - np.min(col)) / (np.max(col) - np.min(col)))
normalization_rfm = normalization_rfm.fillna(0)

# ---- choose k: elbow curvature + silhouette ---------------------------------
clusters = [2, 3, 4, 5, 6, 7, 8]
SSE = []  # within-cluster sum of squared errors per candidate k
for n_clusters in clusters:
    # Fixed random_state / explicit n_init so repeated runs pick the same k
    # (bugfix: the original was nondeterministic, so the personas written to
    # the DB could change between otherwise identical runs).
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans.fit(normalization_rfm)
    SSE.append(kmeans.inertia_)

# Parametric curvature / acceleration of the (k, SSE) curve; the elbow shows
# up as a peak. Only the y-component of the acceleration vector is used below.
x_t = np.gradient(clusters)
y_t = np.gradient(SSE)
vel = np.stack([x_t, y_t], axis=1)
speed = np.sqrt(x_t * x_t + y_t * y_t)
tangent = vel / speed[:, np.newaxis]
d_tangent_x = np.gradient(tangent[:, 0])
d_tangent_y = np.gradient(tangent[:, 1])
dT_dt = np.stack([d_tangent_x, d_tangent_y], axis=1)
length_dT_dt = np.sqrt(d_tangent_x * d_tangent_x + d_tangent_y * d_tangent_y)
normal = dT_dt / length_dT_dt[:, np.newaxis]
d2s_dt2 = np.gradient(speed)
d2x_dt2 = np.gradient(x_t)
d2y_dt2 = np.gradient(y_t)
curvature = np.abs(d2x_dt2 * y_t - x_t * d2y_dt2) / (x_t * x_t + y_t * y_t) ** 1.5
t_component = d2s_dt2[:, np.newaxis]
n_component = (curvature * speed * speed)[:, np.newaxis]
acceleration = t_component * tangent + n_component * normal

# Min-max normalise the y-component of the acceleration — this is the
# "curvature" term of the combined criterion.
accel_y = acceleration[:, 1]
normalization_curvature = (accel_y - np.min(accel_y)) / (np.max(accel_y) - np.min(accel_y))

SCORE = []  # silhouette coefficient per candidate k
for n_clusters in clusters:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    result = kmeans.fit_predict(normalization_rfm)
    SCORE.append(silhouette_score(normalization_rfm, result))

# Combined criterion: 10% elbow curvature + 90% silhouette; highest wins.
# (bugfix: the loop bound was hard-coded to 7 instead of len(clusters))
tup_list = [(clusters[idx], normalization_curvature[idx] * 0.1 + SCORE[idx] * 0.9)
            for idx in range(len(clusters))]
tup_list.sort(key=lambda item: item[1], reverse=True)
k = tup_list[0][0]

k_model = KMeans(n_clusters=k, n_init=10, random_state=0)
k_model.fit(normalization_rfm)
res0Series = pd.Series(k_model.labels_)

# ---- write results back -----------------------------------------------------
# NOTE(review): KMeans label indices are arbitrary — cluster i carries no
# intrinsic "good/bad" ordering, so treating `i >= 2` as "poor performance"
# only holds if clusters are ordered somewhere else; confirm this assumption.
message = ["", "", "据分析,您近期的表现不佳,请注意!", "据分析,您近期的表现存在严重问题,请及时整改!"]
query_data = pd.read_sql("SELECT `account` FROM `sys_user`", db_engine)
# Build the membership set once instead of scanning an ndarray per account.
known_accounts = set(query_data["account"].values)

# Parameterised statements (bugfix: the original concatenated values straight
# into SQL, which is injection-prone and breaks on quotes in the content).
insert_stmt = text(
    "INSERT INTO `sys_message` (`account`, `title`, `content`, `create_user`, `create_time`) "
    "VALUES (:account, '分析结果提醒', :content, 'admin', :create_time)")
update_stmt = text(
    "UPDATE `sys_user` SET `persona` = :persona WHERE `account` IN :accounts"
).bindparams(bindparam("accounts", expanding=True))

# engine.begin() gives a transaction-scoped connection; engine.execute() was
# removed in SQLAlchemy 2.0.
with db_engine.begin() as conn:
    for i in range(0, k):
        res0 = res0Series[res0Series.values == i]
        temp_list = []
        for account in rfm_df2['account'].iloc[res0.index].values:
            if str(account) in known_accounts:
                temp_list.append(str(account))
                # Upper bound guard (bugfix): the original indexed message[i]
                # unconditionally and raised IndexError whenever k > 4.
                if 2 <= i < len(message):
                    conn.execute(insert_stmt, {
                        "account": str(account),
                        "content": message[i],
                        "create_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                    })
        if temp_list:
            conn.execute(update_stmt, {"persona": i, "accounts": temp_list})

print("==== end ====")