#!/usr/bin/env python
# coding: utf-8
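# RFM (Recency / Frequency / Monetary) segmentation of point-record users:
# build per-account R/F/M scores, pick the cluster count k by combining the
# elbow criterion (curvature of the SSE curve) with the silhouette score,
# run K-Means, then write each user's cluster id back to sys_user and notify
# the low-scoring clusters via sys_message.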
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sqlalchemy import create_engine
from urllib.parse import quote_plus as urlquote
import datetime
import numpy as np
import pandas as pd
import time
print("==== start ====")
db_password = 'POMS_129@MySQL'
db_engine = create_engine(f'mysql+pymysql://poms:{urlquote(db_password)}@101.133.164.208:3306/dev_poms?charset=utf8')
sql = "SELECT `create_user` AS `account`, `point_uuid`, `point`, `create_time` FROM `sys_point_record` WHERE `audited` = 1"
raw_data = pd.read_sql(sql, db_engine)
df2 = raw_data.copy().fillna(0)
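# Recency: days elapsed between each record's create_time and now.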
now = pd.to_datetime(datetime.datetime.now())
df2["recent"] = df2["create_time"].map(lambda x: (now - pd.to_datetime(x)).days)
recent_df = df2.groupby(by="account", as_index=False).agg({"recent": "min"})
frequency_df = df2.groupby(by="account", as_index=False).agg({"point_uuid": "count"})
mount_df = df2.groupby(by="account", as_index=False).agg({"point": "sum"})
rfm_df = recent_df.merge(frequency_df, on="account", how="left").merge(mount_df, on="account", how="left")
rfm_df2 = rfm_df.copy()
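# Score each dimension on a 1-5 scale. R and M use quintile bins (R is
# reverse-labelled because a smaller recency is better); F uses the fixed
# bins [1, 3, 5, 12] mapped to scores 1-3.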
rfm_df2["point"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')
recent_labels = [5, 4, 3, 2, 1]
mount_labels = [1, 2, 3, 4, 5]
frequency_bins = [1, 3, 5, 12]
frequency_labels = [1, 2, 3]
m_bins = rfm_df2["point"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')
r_bins = rfm_df2["recent"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')
rfm_df2["R"] = pd.cut(rfm_df2["recent"], bins=r_bins, labels=recent_labels, include_lowest=True)
rfm_df2["F"] = pd.cut(rfm_df2["point_uuid"], bins=frequency_bins, labels=frequency_labels, include_lowest=True)
rfm_df2["M"] = pd.cut(rfm_df2["point"], bins=m_bins, labels=mount_labels, include_lowest=True)
rfm_df2 = rfm_df2.drop(['recent', 'point_uuid', 'point'], axis=1)
# Values that fall outside the bins become NaN; give them the middle score 3.
rfm_df2[["R", "F", "M"]] = rfm_df2[["R", "F", "M"]].fillna(3).astype(int)
rfm = rfm_df2[['R', 'F', 'M']]
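# Min-max normalise each score column to [0, 1] before clustering.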
normalization_rfm = rfm.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
normalization_rfm = normalization_rfm.fillna(0)
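# Candidate cluster counts to evaluate.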
clusters = [2, 3, 4, 5, 6, 7, 8]
# Elbow method: fit K-Means for each candidate k and record the
# within-cluster sum of squared errors (inertia).
SSE = []
for i in clusters:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(normalization_rfm)
    SSE.append(kmeans.inertia_)
# Estimate the curvature of the SSE-vs-k curve numerically (via gradients);
# the elbow shows up as the region of highest curvature.
x_t = np.gradient(clusters)
y_t = np.gradient(SSE)
vel = np.array([[x_t[i], y_t[i]] for i in range(x_t.size)])
speed = np.sqrt(x_t * x_t + y_t * y_t)
tangent = np.array([1 / speed] * 2).transpose() * vel
tangent_x = tangent[:, 0]
tangent_y = tangent[:, 1]
d_tangent_x = np.gradient(tangent_x)
d_tangent_y = np.gradient(tangent_y)
dT_dt = np.array([[d_tangent_x[i], d_tangent_y[i]] for i in range(d_tangent_x.size)])
length_dT_dt = np.sqrt(d_tangent_x * d_tangent_x + d_tangent_y * d_tangent_y)
normal = np.array([1 / length_dT_dt] * 2).transpose() * dT_dt
d2s_dt2 = np.gradient(speed)
d2x_dt2 = np.gradient(x_t)
d2y_dt2 = np.gradient(y_t)
curvature = np.abs(d2x_dt2 * y_t - x_t * d2y_dt2) / (x_t * x_t + y_t * y_t) ** 1.5
t_component = np.array([d2s_dt2] * 2).transpose()
n_component = np.array([curvature * speed * speed] * 2).transpose()
acceleration = t_component * tangent + n_component * normal
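# Normalise the normal (curvature-related) component of the acceleration to
# [0, 1] so it can be blended with the silhouette score on a common scale.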
normalization_acceleration = acceleration.copy()
normalization_acceleration[:, 1] = (acceleration[:, 1] - np.min(acceleration[:, 1])) / (
    np.max(acceleration[:, 1]) - np.min(acceleration[:, 1]))
normalization_curvature = normalization_acceleration[:, 1]
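# Silhouette coefficient for each candidate k (higher means better-separated
# clusters).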
SCORE = []
for i in clusters:
    kmeans = KMeans(n_clusters=i)
    result = kmeans.fit_predict(normalization_rfm)
    score = silhouette_score(normalization_rfm, result)
    SCORE.append(score)
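# Rank candidate k values by a weighted blend of normalised curvature (10%)
# and silhouette score (90%); the top-ranked k is used for the final model.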
tup_list = []
for i in range(len(clusters)):
    item = (clusters[i], normalization_curvature[i] * 0.1 + SCORE[i] * 0.9)
    tup_list.append(item)
tup_list = sorted(tup_list, key=lambda x: x[1], reverse=True)
k = tup_list[0][0]
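# Fit the final model with the selected k; `quantity` holds the cluster sizes
# (kept for inspection) and `res0Series` the per-account cluster labels.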
k_model = KMeans(n_clusters=k)
k_model.fit(normalization_rfm)
quantity = pd.Series(k_model.labels_).value_counts()
res0Series = pd.Series(k_model.labels_)
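# Notification templates indexed by cluster id (kept in Chinese, as they are
# written to the database): entries 2 and 3 warn the user that recent
# performance is poor / seriously problematic; clusters 0 and 1 get no message.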
message = ["", "", "据分析,您近期的表现不佳,请注意!", "据分析,您近期的表现存在严重问题,请及时整改!"]
query_sql = "SELECT `account` FROM `sys_user`"
query_data = pd.read_sql(query_sql, db_engine)
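# For each cluster: collect the accounts that exist in sys_user, notify the
# clusters with index >= 2, and write the cluster id back as the user's persona.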
for i in range(0, k):
    res0 = res0Series[res0Series.values == i]
    temp_list = []
    for account in rfm_df2['account'].iloc[res0.index].values:
        if str(account) in query_data["account"].values:
            temp_list.append(str(account))
            # Only clusters with index >= 2 trigger a notification, and only
            # while a message template exists for that cluster.
            if 2 <= i < len(message):
                insert_sql = "INSERT INTO `sys_message` (`account`, `title`, `content`, `create_user`, `create_time`) VALUES ('" + str(
                    account) + "', '分析结果提醒', '" + message[i] + "', 'admin', '" + str(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + "')"
                db_engine.execute(insert_sql)
    # Write the cluster id back as the persona of every matched account.
    if len(temp_list) > 0:
        update_sql = "UPDATE `sys_user` SET `persona` = " + str(i) + " WHERE `account` IN (" + str(temp_list).replace(
            "[", "").replace("]", "") + ")"
        db_engine.execute(update_sql)
print("==== end ====")