# RFM customer segmentation for dev_poms: scores accounts by
# Recency/Frequency/Monetary, clusters them with K-means (k chosen by a
# blend of elbow-curvature and silhouette score), then writes warning
# messages and persona labels back to MySQL.
#!/usr/bin/env python
# coding: utf-8

import datetime
import os
import time
from urllib.parse import quote_plus as urlquote

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sqlalchemy import bindparam, create_engine, text
print("==== start ====")

# SECURITY: the password was hard-coded in source; keep it only as a fallback
# so existing deployments still run, but prefer the environment variable.
db_password = os.environ.get("POMS_DB_PASSWORD", "POMS_129@MySQL")

# URL-encode the password — a raw '@' would break the DSN.
db_engine = create_engine(
    f'mysql+pymysql://poms:{urlquote(db_password)}@101.133.164.208:3306/dev_poms?charset=utf8'
)

# Only audited point records feed the RFM analysis.
sql = (
    "SELECT `create_user` AS `account`, `point_uuid`, `point`, `create_time` "
    "FROM `sys_point_record` WHERE `audited` = 1"
)
raw_data = pd.read_sql(sql, db_engine)
# --- Build the per-account RFM (Recency, Frequency, Monetary) table ---
df2 = raw_data.copy().fillna(0)

# Recency: days since the account's most recent audited record (min per account).
now = pd.to_datetime(datetime.datetime.now())
df2["recent"] = df2["create_time"].map(lambda x: (now - pd.to_datetime(x)).days)
recent_df = df2.groupby(by="account", as_index=False).agg({"recent": "min"})

# Frequency: record count per account; Monetary: total points per account.
frequency_df = df2.groupby(by="account", as_index=False).agg({"point_uuid": "count"})
mount_df = df2.groupby(by="account", as_index=False).agg({"point": "sum"})

# One row per account with columns: recent, point_uuid (count), point (sum).
rfm_df = recent_df.merge(frequency_df, on="account", how="left").merge(
    mount_df, on="account", how="left"
)

# Work on a copy so rfm_df keeps the raw metric values.
# (A dead no-op `.quantile(...)` expression — notebook residue — was removed here.)
rfm_df2 = rfm_df.copy()
|
# Score labels: lower recency is better (5 = most recent activity),
# higher spend is better (5 = top monetary quintile).
recent_labels = [5, 4, 3, 2, 1]
mount_labels = [1, 2, 3, 4, 5]
frequency_bins = [1, 3, 5, 12]
frequency_labels = [1, 2, 3]

# Quintile edges (6 points -> 5 bins) for the monetary and recency dimensions.
# NOTE(review): pd.cut raises on duplicate bin edges; heavily skewed data could
# yield repeated quantile values here — confirm upstream distribution.
quantile_grid = np.linspace(0, 1, num=6)
m_bins = rfm_df2["point"].quantile(q=quantile_grid, interpolation='nearest')
r_bins = rfm_df2["recent"].quantile(q=quantile_grid, interpolation='nearest')

rfm_df2["R"] = pd.cut(rfm_df2["recent"], bins=r_bins, labels=recent_labels, include_lowest=True)
rfm_df2["F"] = pd.cut(rfm_df2["point_uuid"], bins=frequency_bins, labels=frequency_labels, include_lowest=True)
rfm_df2["M"] = pd.cut(rfm_df2["point"], bins=m_bins, labels=mount_labels, include_lowest=True)

# Keep only account + R/F/M scores. Values outside the bins became NaN;
# map them to the neutral score 3, then cast the categoricals to plain ints.
rfm_df2 = rfm_df2.drop(['recent', 'point_uuid', 'point'], axis=1)
rfm_df2 = rfm_df2.fillna(3)
rfm_df2 = rfm_df2.astype(int)
# Min-max normalise each RFM score to [0, 1] so every axis weighs equally
# in the Euclidean distance used by K-means.
rfm = rfm_df2[['R', 'F', 'M']]
normalization_rfm = rfm.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

# A constant column divides by zero above and becomes all-NaN; treat it as 0.
# (A dead bare-name expression — notebook display residue — was removed here.)
normalization_rfm = normalization_rfm.fillna(0)
|
# Candidate cluster counts to evaluate.
clusters = [2, 3, 4, 5, 6, 7, 8]

# Elbow method: record the within-cluster sum of squares (inertia) per k.
# A fixed random_state makes the run reproducible (the unused `SC` list that
# previously sat here was dead code and has been removed).
SSE = []  # sum of squared errors per candidate k
for n in clusters:
    kmeans = KMeans(n_clusters=n, random_state=0)
    kmeans.fit(normalization_rfm)
    SSE.append(kmeans.inertia_)
|
# 计算曲率
|
|
x_t = np.gradient(clusters)
|
|
y_t = np.gradient(SSE)
|
|
|
|
vel = np.array([[x_t[i], y_t[i]] for i in range(x_t.size)])
|
|
|
|
speed = np.sqrt(x_t * x_t + y_t * y_t)
|
|
|
|
tangent = np.array([1 / speed] * 2).transpose() * vel
|
|
|
|
tangent_x = tangent[:, 0]
|
|
tangent_y = tangent[:, 1]
|
|
|
|
d_tangent_x = np.gradient(tangent_x)
|
|
d_tangent_y = np.gradient(tangent_y)
|
|
|
|
dT_dt = np.array([[d_tangent_x[i], d_tangent_y[i]] for i in range(d_tangent_x.size)])
|
|
|
|
length_dT_dt = np.sqrt(d_tangent_x * d_tangent_x + d_tangent_y * d_tangent_y)
|
|
|
|
normal = np.array([1 / length_dT_dt] * 2).transpose() * dT_dt
|
|
|
|
d2s_dt2 = np.gradient(speed)
|
|
d2x_dt2 = np.gradient(x_t)
|
|
d2y_dt2 = np.gradient(y_t)
|
|
|
|
curvature = np.abs(d2x_dt2 * y_t - x_t * d2y_dt2) / (x_t * x_t + y_t * y_t) ** 1.5
|
|
t_component = np.array([d2s_dt2] * 2).transpose()
|
|
n_component = np.array([curvature * speed * speed] * 2).transpose()
|
|
|
|
acceleration = t_component * tangent + n_component * normal
|
|
|
|
# In[24]:
|
|
|
|
|
|
# Min-max normalise the normal (curvature) component of the acceleration so it
# can be blended with the silhouette score on a comparable [0, 1] scale.
# Copy first: the original code aliased `acceleration` and mutated it in place.
normalization_acceleration = acceleration.copy()
normalization_acceleration[:, 1] = (acceleration[:, 1] - np.min(acceleration[:, 1])) / (
    np.max(acceleration[:, 1]) - np.min(acceleration[:, 1]))
normalization_curvature = normalization_acceleration[:, 1]
|
# Silhouette coefficient per candidate k (higher = better-separated clusters).
# random_state pinned so the scores — and therefore the chosen k — are stable.
SCORE = []
for n in clusters:
    kmeans = KMeans(n_clusters=n, random_state=0)
    labels = kmeans.fit_predict(normalization_rfm)
    SCORE.append(silhouette_score(normalization_rfm, labels))
|
tup_list = []
|
|
for i in range(7):
|
|
item = (clusters[i], normalization_curvature[i] * 0.1 + SCORE[i] * 0.9)
|
|
tup_list.append(item)
|
|
tup_list = sorted(tup_list, key=lambda x: x[1], reverse=True)
|
|
|
|
k = tup_list[0][0]
|
|
# Re-fit K-means at the chosen k on the normalised RFM table.
# random_state pinned for reproducibility of the persisted cluster labels.
k_model = KMeans(n_clusters=k, random_state=0)
k_model.fit(normalization_rfm)

# Members per cluster label (kept for inspection/debugging).
quantity = pd.Series(k_model.labels_).value_counts()
res0Series = pd.Series(k_model.labels_)

# Warning texts per cluster index; clusters 0-1 are considered healthy and
# receive no message.
message = ["", "", "据分析,您近期的表现不佳,请注意!", "据分析,您近期的表现存在严重问题,请及时整改!"]

# Only notify/update accounts that still exist in sys_user.
query_data = pd.read_sql("SELECT `account` FROM `sys_user`", db_engine)
known_accounts = set(str(a) for a in query_data["account"].values)

# Parameterized statements replace the previous string-concatenated SQL,
# which was injectable via account values; `engine.execute` is also gone in
# SQLAlchemy 2.0, so run everything through a transactional connection.
insert_stmt = text(
    "INSERT INTO `sys_message` (`account`, `title`, `content`, `create_user`, `create_time`) "
    "VALUES (:account, :title, :content, 'admin', :create_time)"
)
update_stmt = text(
    "UPDATE `sys_user` SET `persona` = :persona WHERE `account` IN :accounts"
).bindparams(bindparam("accounts", expanding=True))

with db_engine.begin() as conn:
    for i in range(0, k):
        res0 = res0Series[res0Series.values == i]
        temp_list = []
        for account in rfm_df2['account'].iloc[res0.index].values:
            if str(account) not in known_accounts:
                continue
            temp_list.append(str(account))
            if i >= 2:
                # Clamp the index: k may exceed 4, but only the two warning
                # texts exist — the old code raised IndexError for i > 3.
                content = message[min(i, len(message) - 1)]
                conn.execute(insert_stmt, {
                    "account": str(account),
                    "title": "分析结果提醒",
                    "content": content,
                    "create_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                })
        if len(temp_list) > 0:
            conn.execute(update_stmt, {"persona": i, "accounts": temp_list})

print("==== end ====")