You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1470 lines
84 KiB
Plaintext

2 years ago
{
"cells": [
{
"cell_type": "markdown",
"id": "fecd214a",
"metadata": {},
"source": [
"python 3.6.13\n",
"matplotlib 3.3.4\n",
"numpy 1.19.2\n",
"pandas 1.1.5\n",
"scikit-learn 0.24.2\n",
"seaborn 0.11.2\n",
"sqlalchemy 1.4.22"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "12d24a6c",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"from sqlalchemy import create_engine\n",
"from urllib.parse import quote_plus as urlquote\n",
"import datetime\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "121a3f95",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:31.824351Z",
"start_time": "2022-01-05T07:53:31.780403Z"
}
},
"outputs": [],
"source": [
"db_password = 'POMS_129@MySQL'\n",
"db_engine = create_engine(f'mysql+pymysql://poms:{urlquote(db_password)}@101.133.164.208:3306/dev_poms?charset=utf8')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "faa12069",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:32.314051Z",
"start_time": "2022-01-05T07:53:31.827278Z"
}
},
"outputs": [],
"source": [
"sql = \"SELECT `create_user` AS `account`, `point_uuid`, `point`, `create_time` FROM `sys_point_record` WHERE `audited` = 1\"\n",
"raw_data = pd.read_sql(sql, db_engine)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8120b8c2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>account</th>\n",
" <th>point_uuid</th>\n",
" <th>point</th>\n",
" <th>create_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1951134</td>\n",
" <td>681cb6da8c9a4984ab6bf5c39bc9cfdc</td>\n",
" <td>9</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2051111</td>\n",
" <td>8312dac5b3cf5d93b511b1111a9aad30</td>\n",
" <td>6</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1951122</td>\n",
" <td>b2f8e1aa52697d4088f0c3d27cf5fd01</td>\n",
" <td>2</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2051110</td>\n",
" <td>afe319c5b9cec39700d219bc2cb55137</td>\n",
" <td>4</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1951122</td>\n",
" <td>afe319c5b9cec39700d219bc2cb55137</td>\n",
" <td>2</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76045</th>\n",
" <td>5396174</td>\n",
" <td>b2f8e1aa52697d4088f0c3d27cf5fd01</td>\n",
" <td>7</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76046</th>\n",
" <td>2051107</td>\n",
" <td>a373d0b08d8f3698f94c4a81d070bc3d</td>\n",
" <td>9</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76047</th>\n",
" <td>2051102</td>\n",
" <td>dde9d4a8db4ec74a39e3ceae0734815f</td>\n",
" <td>4</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76048</th>\n",
" <td>1951130</td>\n",
" <td>6f99a016eb26d149cc3eb2ef4ea5bba8</td>\n",
" <td>3</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76049</th>\n",
" <td>1951129</td>\n",
" <td>136e05d5470d9e843d699e1bc10bd17a</td>\n",
" <td>5</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>76050 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" account point_uuid point create_time\n",
"0 1951134 681cb6da8c9a4984ab6bf5c39bc9cfdc 9 2021-01-01\n",
"1 2051111 8312dac5b3cf5d93b511b1111a9aad30 6 2021-01-01\n",
"2 1951122 b2f8e1aa52697d4088f0c3d27cf5fd01 2 2021-01-01\n",
"3 2051110 afe319c5b9cec39700d219bc2cb55137 4 2021-01-01\n",
"4 1951122 afe319c5b9cec39700d219bc2cb55137 2 2021-01-01\n",
"... ... ... ... ...\n",
"76045 5396174 b2f8e1aa52697d4088f0c3d27cf5fd01 7 2021-12-29\n",
"76046 2051107 a373d0b08d8f3698f94c4a81d070bc3d 9 2021-12-29\n",
"76047 2051102 dde9d4a8db4ec74a39e3ceae0734815f 4 2021-12-29\n",
"76048 1951130 6f99a016eb26d149cc3eb2ef4ea5bba8 3 2021-12-29\n",
"76049 1951129 136e05d5470d9e843d699e1bc10bd17a 5 2021-12-29\n",
"\n",
"[76050 rows x 4 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 看一眼\n",
"raw_data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c1954360",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:32.439640Z",
"start_time": "2022-01-05T07:53:32.365840Z"
}
},
"outputs": [],
"source": [
"# 去除缺失值\n",
"df2 = raw_data.copy().fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1f387b2f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>account</th>\n",
" <th>point_uuid</th>\n",
" <th>point</th>\n",
" <th>create_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1951134</td>\n",
" <td>681cb6da8c9a4984ab6bf5c39bc9cfdc</td>\n",
" <td>9</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2051111</td>\n",
" <td>8312dac5b3cf5d93b511b1111a9aad30</td>\n",
" <td>6</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1951122</td>\n",
" <td>b2f8e1aa52697d4088f0c3d27cf5fd01</td>\n",
" <td>2</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2051110</td>\n",
" <td>afe319c5b9cec39700d219bc2cb55137</td>\n",
" <td>4</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1951122</td>\n",
" <td>afe319c5b9cec39700d219bc2cb55137</td>\n",
" <td>2</td>\n",
" <td>2021-01-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76045</th>\n",
" <td>5396174</td>\n",
" <td>b2f8e1aa52697d4088f0c3d27cf5fd01</td>\n",
" <td>7</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76046</th>\n",
" <td>2051107</td>\n",
" <td>a373d0b08d8f3698f94c4a81d070bc3d</td>\n",
" <td>9</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76047</th>\n",
" <td>2051102</td>\n",
" <td>dde9d4a8db4ec74a39e3ceae0734815f</td>\n",
" <td>4</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76048</th>\n",
" <td>1951130</td>\n",
" <td>6f99a016eb26d149cc3eb2ef4ea5bba8</td>\n",
" <td>3</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76049</th>\n",
" <td>1951129</td>\n",
" <td>136e05d5470d9e843d699e1bc10bd17a</td>\n",
" <td>5</td>\n",
" <td>2021-12-29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>76050 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" account point_uuid point create_time\n",
"0 1951134 681cb6da8c9a4984ab6bf5c39bc9cfdc 9 2021-01-01\n",
"1 2051111 8312dac5b3cf5d93b511b1111a9aad30 6 2021-01-01\n",
"2 1951122 b2f8e1aa52697d4088f0c3d27cf5fd01 2 2021-01-01\n",
"3 2051110 afe319c5b9cec39700d219bc2cb55137 4 2021-01-01\n",
"4 1951122 afe319c5b9cec39700d219bc2cb55137 2 2021-01-01\n",
"... ... ... ... ...\n",
"76045 5396174 b2f8e1aa52697d4088f0c3d27cf5fd01 7 2021-12-29\n",
"76046 2051107 a373d0b08d8f3698f94c4a81d070bc3d 9 2021-12-29\n",
"76047 2051102 dde9d4a8db4ec74a39e3ceae0734815f 4 2021-12-29\n",
"76048 1951130 6f99a016eb26d149cc3eb2ef4ea5bba8 3 2021-12-29\n",
"76049 1951129 136e05d5470d9e843d699e1bc10bd17a 5 2021-12-29\n",
"\n",
"[76050 rows x 4 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 看一眼\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c423e07e",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:41.383720Z",
"start_time": "2022-01-05T07:53:32.508456Z"
}
},
"outputs": [],
"source": [
"## Recent【最近一次消费数据】\n",
"now = pd.to_datetime(datetime.datetime.now())\n",
"# 添加时间差数据\n",
"df2[\"recent\"] = df2[\"create_time\"].map(lambda x: (now - pd.to_datetime(x)).days)\n",
"# 用户最近一次购买商品的时间\n",
"recent_df = df2.groupby(by=\"account\", as_index=False).agg({\"recent\": \"min\"})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "dc957d34",
"metadata": {},
"outputs": [],
"source": [
"## Frequency【一段时间内的消费次数】\n",
"frequency_df = df2.groupby(by=\"account\", as_index=False).agg({\"point_uuid\": \"count\"})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "308bc08d",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:41.636044Z",
"start_time": "2022-01-05T07:53:41.593163Z"
}
},
"outputs": [],
"source": [
"## Mount【一段时间内的消费总金额】\n",
"mount_df = df2.groupby(by=\"account\", as_index=False).agg({\"point\": \"sum\"})"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "de978fb4",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:41.760711Z",
"start_time": "2022-01-05T07:53:41.671948Z"
}
},
"outputs": [],
"source": [
"## RFM模型\n",
"# 根据 account 合并数据\n",
"rfm_df = recent_df.merge(frequency_df, on=\"account\", how=\"left\").merge(mount_df, on=\"account\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "dc4af967",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:41.806587Z",
"start_time": "2022-01-05T07:53:41.794621Z"
}
},
"outputs": [],
"source": [
"rfm_df2 = rfm_df.copy()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0c145514",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:41.883382Z",
"start_time": "2022-01-05T07:53:41.840497Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.0 0\n",
"0.2 2\n",
"0.4 4\n",
"0.6 6\n",
"0.8 8\n",
"1.0 13057\n",
"Name: point, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rfm_df2[\"point\"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "3d49d903",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>account</th>\n",
" <th>recent</th>\n",
" <th>point_uuid</th>\n",
" <th>point</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001611</td>\n",
" <td>464</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1005374</td>\n",
" <td>589</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1005407</td>\n",
" <td>624</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1010153</td>\n",
" <td>756</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1010528</td>\n",
" <td>467</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5881</th>\n",
" <td>9990197</td>\n",
" <td>809</td>\n",
" <td>1</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5882</th>\n",
" <td>9990991</td>\n",
" <td>573</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5883</th>\n",
" <td>9992257</td>\n",
" <td>666</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5884</th>\n",
" <td>9993826</td>\n",
" <td>498</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5885</th>\n",
" <td>9998323</td>\n",
" <td>734</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5886 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" account recent point_uuid point\n",
"0 1001611 464 1 4\n",
"1 1005374 589 1 7\n",
"2 1005407 624 1 5\n",
"3 1010153 756 1 10\n",
"4 1010528 467 1 7\n",
"... ... ... ... ...\n",
"5881 9990197 809 1 8\n",
"5882 9990991 573 1 4\n",
"5883 9992257 666 1 0\n",
"5884 9993826 498 1 0\n",
"5885 9998323 734 1 7\n",
"\n",
"[5886 rows x 4 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rfm_df2"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "605888d7",
"metadata": {},
"outputs": [],
"source": [
"# 根据分位数分层划分为5个等级\n",
"recent_labels = [5, 4, 3, 2, 1]\n",
"mount_labels = [1, 2, 3, 4, 5]\n",
"frequency_bins = [1, 3, 5, 12]\n",
"frequency_labels = [1, 2, 3]\n",
"\n",
"m_bins = rfm_df2[\"point\"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')\n",
"r_bins = rfm_df2[\"recent\"].quantile(q=np.linspace(0, 1, num=6), interpolation='nearest')\n",
"rfm_df2[\"R\"] = pd.cut(rfm_df2[\"recent\"], bins=r_bins, labels=recent_labels, include_lowest=True)\n",
"rfm_df2[\"F\"] = pd.cut(rfm_df2[\"point_uuid\"], bins=frequency_bins, labels=frequency_labels, include_lowest=True)\n",
"rfm_df2[\"M\"] = pd.cut(rfm_df2[\"point\"], bins=m_bins, labels=mount_labels, include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "6dbd63f1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>account</th>\n",
" <th>R</th>\n",
" <th>F</th>\n",
" <th>M</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001611</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1005374</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1005407</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1010153</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1010528</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5881</th>\n",
" <td>9990197</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5882</th>\n",
" <td>9990991</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5883</th>\n",
" <td>9992257</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5884</th>\n",
" <td>9993826</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5885</th>\n",
" <td>9998323</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5886 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" account R F M\n",
"0 1001611 5 1 2\n",
"1 1005374 4 1 4\n",
"2 1005407 3 1 3\n",
"3 1010153 1 1 5\n",
"4 1010528 5 1 4\n",
"... ... .. .. ..\n",
"5881 9990197 1 1 4\n",
"5882 9990991 4 1 2\n",
"5883 9992257 3 1 1\n",
"5884 9993826 5 1 1\n",
"5885 9998323 2 1 4\n",
"\n",
"[5886 rows x 4 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 删除不需要的列\n",
"rfm_df2 = rfm_df2.drop(['recent', 'point_uuid', 'point'], axis=1)\n",
"rfm_df2"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c81fdc73",
"metadata": {
"ExecuteTime": {
"end_time": "2022-01-05T07:53:42.199538Z",
"start_time": "2022-01-05T07:53:42.187570Z"
}
},
"outputs": [],
"source": [
"rfm_df2 = rfm_df2.fillna(3)\n",
"rfm_df2 = rfm_df2.astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8e19592c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>account</th>\n",
" <th>R</th>\n",
" <th>F</th>\n",
" <th>M</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1001611</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1005374</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1005407</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1010153</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1010528</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5881</th>\n",
" <td>9990197</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5882</th>\n",
" <td>9990991</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5883</th>\n",
" <td>9992257</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5884</th>\n",
" <td>9993826</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5885</th>\n",
" <td>9998323</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5886 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" account R F M\n",
"0 1001611 5 1 2\n",
"1 1005374 4 1 4\n",
"2 1005407 3 1 3\n",
"3 1010153 1 1 5\n",
"4 1010528 5 1 4\n",
"... ... .. .. ..\n",
"5881 9990197 1 1 4\n",
"5882 9990991 4 1 2\n",
"5883 9992257 3 1 1\n",
"5884 9993826 5 1 1\n",
"5885 9998323 2 1 4\n",
"\n",
"[5886 rows x 4 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rfm_df2"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "1a033585",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# 对数据进行提取选取我们需要的列\n",
"rfm = rfm_df2[['R', 'F', 'M']]\n",
"#使用lambda函数对数据进行归一化处理\n",
"normalization_rfm = rfm.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "1b20e949",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>R</th>\n",
" <th>F</th>\n",
" <th>M</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.00</td>\n",
" <td>0.0</td>\n",
" <td>0.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.75</td>\n",
" <td>0.0</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.50</td>\n",
" <td>0.0</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.00</td>\n",
" <td>0.0</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5881</th>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5882</th>\n",
" <td>0.75</td>\n",
" <td>0.0</td>\n",
" <td>0.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5883</th>\n",
" <td>0.50</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5884</th>\n",
" <td>1.00</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5885</th>\n",
" <td>0.25</td>\n",
" <td>0.0</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5886 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" R F M\n",
"0 1.00 0.0 0.25\n",
"1 0.75 0.0 0.75\n",
"2 0.50 0.0 0.50\n",
"3 0.00 0.0 1.00\n",
"4 1.00 0.0 0.75\n",
"... ... ... ...\n",
"5881 0.00 0.0 0.75\n",
"5882 0.75 0.0 0.25\n",
"5883 0.50 0.0 0.00\n",
"5884 1.00 0.0 0.00\n",
"5885 0.25 0.0 0.75\n",
"\n",
"[5886 rows x 3 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"normalization_rfm = normalization_rfm.fillna(0)\n",
"normalization_rfm"
]
},
{
"cell_type": "markdown",
"id": "3a13399c",
"metadata": {},
"source": [
"$$\n",
"SSE = \\sum^{k}_{i=1}\\sum_{p\\in{C_i}}|p-m_i|^2\n",
"$$\n",
"$C_i$是第$i$个簇,$p$是$C_i$中的样本点,$m_i$是$C_i$的质心($C_i$中所有样本的均值),$SSE$是所有样本的聚类误差,代表了聚类效果的好坏。"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "422c054b",
"metadata": {},
"outputs": [],
"source": [
"clusters = [2, 3, 4, 5, 6, 7, 8]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a0c1889b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[934.2103968347631, 597.3608447740422, 384.3004298136629, 317.9862632164778, 271.5227306631748, 231.6340074056924, 196.15821396905733]\n"
]
}
],
"source": [
"# 手肘法来确定最优K值\n",
"SSE = [] # 误差平方和\n",
"SC = [] # 轮廓系数\n",
"for i in clusters:\n",
" kmeans = KMeans(n_clusters=i)\n",
" kmeans.fit(normalization_rfm)\n",
" SSE.append(kmeans.inertia_)\n",
"print(SSE)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "a50606ef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'SSE')"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEECAYAAAAlEzNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAqyUlEQVR4nO3deXzU9b3v8dcsmSwzkwyEBKEhkCAhLMZEkd0oxyW21i0qSFqwR2sl9dSGWy1LNdiHHpHrrVJtPaBF7Y3FkAou154jUooGoUYKIohEJewkQNhnhjiZJHP/CMSgEjGZyWzv5+Ph4wET5pfPx9G8+X2X39fg8/l8iIhI1DIGuwAREQkuBYGISJRTEIiIRDkFgYhIlFMQiIhEOXOwC/iuWlpaaG7u3EInk8nQ6feGGvUSeiKlD1AvoaorvcTEmM76tbALguZmH8eOnezUex2OhE6/N9Sol9ATKX2AeglVXeklJcV+1q9paEhEJMopCEREopyCQEQkyikIRESinIJARCTKKQhERKKcgkBEJMpFTRAcOdnIf2+uQ0/dFhE5U9QEwUf7TvDLio9Ys+NIsEsREQkpURMEl2b2JLOXld+/u52m5pZglyMiEjKiJgjMJiMzrhnMziMNLNu0P9jliIiEjKgJAoAJWSlcku7g2bU7cX7RFOxyRERCQlQFgcFgoOSyTE580cTzVbuDXY6ISEiIqiAAyEq1cd3w3iz5cB97jzUEuxwRkaCLuiAAmDZuAGajgT+s3hHsUkREgi4g5xE0NjYya9Ys9uzZg81mo7S0FIPBwMyZMzEYDAwaNIg5c+ZgNBqpqKigvLwcs9lMcXExEyZMCERJZ0ixxTL1kn4sXLuLjXuPk5uWFPDvKSISqgISBBUVFSQkJFBRUcH27dt5+OGHiYmJoaSkhFGjRlFaWsrKlSvJzc2lrKyMpUuX4vF4KCoqYty4cVgslkCUdYYfj0jj1U11PPnudl4oysVoMAT8e4qIhKKABMG2bdvIz88HIDMzk5qaGpqbmxk5ciQA+fn5rFmzBqPRSF5eHhaLBYvFQnp6OtXV1eTk5Jz12iaTAYcjoVN1mUzGM957f8Fg7l+6mdW7j3PDhX07dc1g+Wov4SxSeomUPkC9hKpA9RKQIBgyZAirVq3iyiuv5KOPPuLAgQMkJydjOPW3bqvVitPpxOVyYbd/eXya1WrF5XJ1eG1/HlWZ39/BkN42/vdb1YzqayeugzM9Q42O3ws9kdIHqJdQFVZHVd58883YbDamTp3KqlWrGDZsGEbjl9/K7XaTmJiIzWbD7Xaf8Xr7YAg0o8FAyeWZHHQ1snj9vm77viIioSQgQbB582YuvvhiysrKuPLKK+nXrx9Dhw6lqqoKgMrKSkaMGEFOTg7r16/H4/HgdDqpqakhKysrECWd1UVpDiYM6sWLH+zmkLuxW7+3iEgoCMjQUP/+/fn973/P888/j91u5z//8z85efIkDz74IE888QSZmZkUFBRgMpmYMmUKRUVF+Hw+pk+fTmxsbCBK6tAvLs1gdc1hFqzZyQNXd28QiYgEm8EXZs9l9nqb/TZH0N6T79Tw8vp9vDTlIrJSbV0psVto3DP0REofoF5CVVjNEYSjO0enkxhnZv6723VmgYhEFQXBKYlxMdw1pj/rdh/TmQUiElUUBO3cfGEf0nvE68wCEYkqCoJ2zCYjv7wsk51HGnh1s84sEJHooCD4ikszezIi3cGza3fpzAIRiQoKgq84fWbB8QYvL+jMAhGJAgqCbzA41cYPh/WmXGcWiEgUUBCcRfH4AZgMBv6oMwtEJMIpCM4ixRbL1JH9+Ptnh/ho3/FglyMiEjAKgg78eEQaqTYLT76znRZtMhORCKUg6EB8jImfj89gy34nb1fXB7scEZGAUBB8i+8PTSU71cYfVu/gC29zsMsREfE7BcG3OH1mwQGnh5c36MwCEYk8CoJzcHE/B5efn8yLVXt0ZoGIRBwFwTn6RX4mjc0tLFyzM9iliIj4lYLgHKX3iGdiXl/e+Hg/n9d3fK6yiEg4URB8B3eOTscea2b+OzqzQEQih4LgO0iMi+GnY/rzwe5jrN1xNNjliIj4hYLgO7ql/ZkFLborEJHwF5Ag8Hq9/OpXv+K2226jqKiImpoadu3axeTJkykqKmLOnDm0tLQe/FJRUUFhYSETJ05k1apVgSjHr8wmI/fmZ7LjyEle21QX7HJERLrMHIiLvvvuuzQ1NVFeXs6aNWuYP38+Xq+XkpISRo0aRWlpKStXriQ3N5eysjKWLl2Kx+OhqKiIcePGYbFYAlGW3+QP7MmIfkksXLuLa4akYosNyL9GEZFuEZCfYBkZGTQ3N9PS0oLL5cJsNrNx40ZGjhwJQH5+PmvWrMFoNJKXl4fFYsFisZCenk51dTU5OTlnvbbJZMDhSOhUXSaTsdPv/aoHfjiMmxasZfHGOn5dMNgv1/wu/NlLsEVKL5HSB6iXUBWoXgISBAkJCezbt4/vf//7HD16lAULFrBu3ToMBgMAVqsVp9OJy+XCbre3vc9qteJydbw0s7nZx7FjJztVl8OR0On3ftX3EsxcO7Q3L/5zJ9dm9+J7SfF+ue658mcvwRYpvURKH6BeQlVXeklJsZ/1awGZI3jxxRcZP348y5cv5/XXX2fmzJl4vd62r7vdbhITE7HZbLjd7jNebx8Moa54XOuZBX+o3BnsUkREOi0gQZCYmNj2Az0pKYmmpiaGDh1KVVUVAJWVlYwYMYKcnBzWr1+Px+PB6XRSU1NDVlZWIEoKiFR7LFMv6cffP6vXmQUiErYMvgDsjHK73cyePZv6+nq8Xi9Tp05l+PDhPPjgg3i9XjIzM3nkkUcwmUxUVFSwZMkSfD4fd999NwUFBR1e2+ttDomhodMavM3c/Pw6Um2xPF+Ui/HU8Feg6XY39ERKH6BeQlWghoYCEgSBFGpBAPDmlv389q3PeOQH2RQMSfX79b+J/uMOPZHSB6iXUBVWcwTR5gdDe+vMAhEJWwoCPzh9ZsF+nVkgImFIQeAn7c8sOKwzC0QkjCgI/OgX+Zl4mltYuHZnsEsRETlnCgI/Su8Rz8Tcvry+eT/b6t3f/gYRkRCgIPCzO0enY4s1M//dGp1ZICJhQUHgZ0nxrWcWVO06xtqdOrNAREKfgiAA2s4seEdnFohI6FMQBECMyci9+Rk6s0BEwoKCIEDyByZzcb8knl27C5enKdjliIiclYIgQAwGAyWXZXKswcsLVXuCXY6IyFkpCAIou7edHwzrzcsb9rLveEOwyxER+UYKggD7+bgBGA0G/rh6Z7BLERH5RgqCAGs9syCNFZ/Ws6n2RLDLERH5GgVBN5hyST96WS08+Y42mYlI6FEQdIP4GBM/Hz+Aj+ucrPi0PtjliIicQUHQTa4d1pvBqTaertSZBSISWhQE3cR4ajnpfqeHcp1ZICIhxByIiy5btoxXX30VAI/Hw9atW1m8eDGPPvooBoOBQYMGMWfOHIxGIxUVFZSXl2M2mykuLmbChAmBKCkkjEh3cNnAZF78YA/XDT+PZKsl2CWJiATmjqCwsJCysjLKysoYNmwYDzzwAH/84x8pKSlh8eLF+Hw+Vq5cSX19PWVlZZSXl7No0SKeeOIJGhsj+1CXX+Rn8EVTC8+u3RXsUkREgAAPDW3evJlt27YxadIktmzZwsiRIwHIz89n7dq1bNq0iby8PCwWC3a7nfT0dKqrqwNZUtD175nArbl9eW1zHdsO6cwCEQm+gAwNnbZw4ULuueceAHw+HwaDAQCr1YrT6cTlcmG329v+vNVqxeVydXhNk8mAw5HQqXpMJmOn3+tPvyoYzP9sPcgza3bx/O0jOnWNUOnFHyKll0jpA9RLqApULwELghMnTrB9+3ZGjx4NgNH45c2H2+0mMTERm82G2+0+4/X2wfBNmpt9HDt2slM1ORwJnX6vv90xqh9PvrOd//5wL2Mzen7n94dSL10VKb1ESh+gXkJVV3pJSTn7z9aADQ2tW7eOsWPHtv1+6NChVFVVAVBZWcmIESPIyclh/fr1eDwenE4nNTU1ZGVlBaqkkHJrbl/6OeKY/67
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.lineplot(x=clusters, y=SSE)\n",
"plt.xlabel('K')\n",
"plt.ylabel('SSE')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "714adb7d",
"metadata": {},
"outputs": [],
"source": [
"# 计算曲率\n",
"x_t = np.gradient(clusters)\n",
"y_t = np.gradient(SSE)\n",
"\n",
"vel = np.array([[x_t[i], y_t[i]] for i in range(x_t.size)])\n",
"\n",
"speed = np.sqrt(x_t * x_t + y_t * y_t)\n",
"\n",
"tangent = np.array([1 / speed] * 2).transpose() * vel\n",
"\n",
"tangent_x = tangent[:, 0]\n",
"tangent_y = tangent[:, 1]\n",
"\n",
"d_tangent_x = np.gradient(tangent_x)\n",
"d_tangent_y = np.gradient(tangent_y)\n",
"\n",
"dT_dt = np.array([[d_tangent_x[i], d_tangent_y[i]] for i in range(d_tangent_x.size)])\n",
"\n",
"length_dT_dt = np.sqrt(d_tangent_x * d_tangent_x + d_tangent_y * d_tangent_y)\n",
"\n",
"normal = np.array([1 / length_dT_dt] * 2).transpose() * dT_dt\n",
"\n",
"d2s_dt2 = np.gradient(speed)\n",
"d2x_dt2 = np.gradient(x_t)\n",
"d2y_dt2 = np.gradient(y_t)\n",
"\n",
"curvature = np.abs(d2x_dt2 * y_t - x_t * d2y_dt2) / (x_t * x_t + y_t * y_t) ** 1.5\n",
"t_component = np.array([d2s_dt2] * 2).transpose()\n",
"n_component = np.array([curvature * speed * speed] * 2).transpose()\n",
"\n",
"acceleration = t_component * tangent + n_component * normal"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "eadb907c",
"metadata": {},
"outputs": [],
"source": [
"normalization_acceleration = acceleration\n",
"normalization_acceleration[:, 1] = (acceleration[:, 1] - np.min(acceleration[:, 1]))/(np.max(acceleration[:, 1]) - np.min(acceleration[:,1]))\n",
"normalization_curvature = normalization_acceleration[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b2d383e8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'normalization curvature')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEECAYAAADAoTRlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAvAElEQVR4nO3deVzUdf4H8NcwwzEwwIByyQACCpJaCJgnmakV2qVkeGGZ1XY93Mpfm7bJZroeuWttuh7rbrZ5YmUqnptHYdS6QmKigCJKcYOCOFxz/v5AJy1xcGDmO8fr+Xj4iJkvM/P6PIbe3++85/v9fER6vV4PIiJyKE5CByAiIstj8ScickAs/kREDojFn4jIAbH4ExE5IInQATpCp9NBqzXtpCSxWGTyY60Nx2Kd7GUs9jIOgGO5ztlZ3O42myj+Wq0e9fVNJj1WLnc3+bHWhmOxTvYyFnsZB8CxXOfn59nuNrZ9iIgcEIs/EZEDYvEnInJALP5ERA6IxZ+IyAGx+BMROSCzFf+TJ08iNTX1N/cfPnwYycnJSElJwbZt28z18kREdBtmOc9/3bp12LVrF6RS6U33q9VqLF68GJ9//jmkUikmT56MkSNHws/PzxwxyAo1tKhx+GwthkT5I8Ct/QtQiMi8zFL8Q0NDsWLFCvzhD3+46f7z588jNDQU3t7eAID4+HhkZ2cjKSnpts8nFosgl7ublEUsdjL5sdbGVsei0+nxv4uX8VlOKfafqYJKo8NdeVXY8dIQiEQioeN1mq2+L79mL+MAOJaOMEvxf+ihh1BaWvqb+5VKJTw9f7nizMPDA0ql0ujz8QrfNrY2lhplK3afrsKuvEqU1rdA5irG4/0C4ekqxsfHfsbBU+UYGOojdMxOs7X3pT32Mg6AY7nudlf4WnR6B5lMhsbGRsPtxsbGm3YGZPs0Oj2yii9j56kKfHfhMrR6ID7EG88PCcMDvbvDzVmMVo0OO/OqsCm7zC6KP5Etsmjxj4yMRElJCerr6+Hu7o7s7GzMnDnTkhHITH6ua8auvErsPl2F2kYVunm4YNrAEDzWLxChPjd/9+MqccK0QaH42+EiFF9qREQ3D4FSEzkuixT/jIwMNDU1ISUlBXPmzMHMmTOh1+uRnJyMgIAAS0QgM2hRa3GkqBY7T1Ui5+crcBIBw8J98Xj/IAwL94FE3P7JZFPuDcWazGJszi7DOw9FWTA1EQGAyBYWcFertez5w3rGUlitxM5TldifX42rrRoEe7vh8f6BGHdXAPw9XTv0HHK5O+Z8fhK78iqR8fwgdPNwMXNq87GW96Wz7GUcAMdyndX0/Ml2KVs12J9fjV15lcivUsJFLMLI3t3xRP8gxIV4w8mEs3amxCuw/WQFPsstx4vDenZ9aCJqF4s/tUuv1yO3rAE7T1Xg4NlatGp06O3ngTcfiMRDffzhLXXu1POH+khxX2Q3fJ5bjmfuDYHbbRaeIKKuxeJPv3GpUYU9p6uwM68SP9U1w8NFjEf6BuCxfoGICZB16bn50xIU+Ob8Jew+XYUnY3t02fMS0e2x+BOAtlM0/3vxMnaeqsTR4svQ6vQYEOyFGYNCMCrKD1IzHZXfE+yFvoGe2PJDGSbcE2RS+4iI7hyLv4Mru9KMXXlV2J1XiWqlCr7uzpgSF4zH+gWiZzfzXyEpEokwNUGBt3fn4+j5SxjRq7vZX5OIWPwdUqtGh2+KarHjVCWO/1QPJxEwpKcv/u+BQCRG+N72FE1zGNm7O3p4uWJjdimLP5GFsPg7kKKaRuw4VYH9+dW40qJBDy9X/G5oGB7pG4BALzfBckmcRJgUr8DyI+dxuqIBfYO8BMtC5ChY/O2cslWD/xTWYNepSpyuvApnsQj39+qOx/sHYmCo3Gp67I/1C8A/vruIjdllWPwoiz+RubH42yG9Xo8fyxuw81QlviqsQYtGh4hu7nhjZCSSYvwh7+Qpmubg4SLBhLuDsDG7FGVXeiLYW2r8QURkMhZ/O1LXpMKeM9XYeaoCFy83w91ZjIdi/PFE/0D0DfS0+umTUwYEY1NOGbb+UI7ZIyOFjkNk11j8bZxWp8exkjrsyqvEN0WXoNHp0T/IC/MeDMHoaD+4u9jOhVP+nq54qI8fdp2qxAtDwuDpxj9PInPh/102qqKhBRl5ldiVV4Wqq62QS53x1IAeeLx/oE3Pkjk1XoG9Z6rx5Y8VmH5viNBxiOwWi78NadXocLCwBjtPVeJYSR0AYFBPH7w2IgL3RXaDi8Syp2iaQ5S/DAND5dh6ogyT44PhbOHTTokcBYu/jdiXX4UPvi5GXZMaAZ6ueG5IKB7tF4ggAU/RNJdpCQr8fnseviqswdi7OOU3kTmw+NuAgqqrWHDgLPr28ML8pBDcG+oDsZN1f3nbGUN6+iCimzs2ZpciKcbf6r+oJrJF/Ext5ZStGszdnQ8fqTPWTo3HkJ6+dl34gWtTPsQrcK6mEcd/qhc6DpFdYvG3Ynq9Hn/+zzlUXGnBokdi4GvDC57cqYdj/OHr7oyN2aVCRyGySyz+VuyLkxU4eLYGLw8Pxz3B3kLHsSgXiRNSBgTj+4t1OF/bKHQcIrvD4m+lCquUWP71eQwN98G0gQqh4whiwj1BcJU4YXMOj/6JuhqLvxVStmowZ/cZ+EidMf/hPlYz/46lyaXOeLRvAPblV6O2USV0HCK7wuJvZfR6PRZ91dbn//O4GMjdrW8eHkuaEq+ARqvHZyfKhI5CZFdY/K3M9h8r8FVhDV4aHo5YhWP1+W8lxEeKEb264YuTFWhWa4WOQ2Q3WPytSGGVEsuPtPX5Ux20z38r0xIUuNKiwe7TVUJHIbIbLP5Wou18/jOQS53x7sPRDtvnv5W7e3ihX5AnNueUQqvTCx2HyC6w+FuB633+8mt9fh93xzmfvyNEIhGmJShQWt+CzPOXhI5DZBdY/K3Al9f6/C8O68k+fzvu79UdPbzdsIkXfRF1CRZ/gRVWK/HXI+cxpKcPpzC+DbGTCJPjgnGyvAGnyhuEjkNk81j8BdSo0uDt3fnwljpjfhL7/MY81i8Qnq4SbOJFX0SdxuIvEL1ej8VfnUNpfTP7/B3k7iLGhHuCcORcLcquNAsdh8imsfgL5MtTlThQ0NbnH8A+f4elDOgBJ5EIW3J40RdRZ7D4C+BstRJ/PVyEwT198DT7/HfET3Ztnd+8SjS0qIWOQ2SzzFL8dTod0tLSkJKSgtTUVJSUlNy0fdeuXRg/fjySk5OxefNmc0SwWo2qtvn52ec33dQEBZrVOmw/WSF0FCKbZZbif/DgQahUKqSnp2P27NlYsmTJTdvff/99rF+/Hlu2bMH69etx5coVc8SwOjf2+ReO6wNf9vlN0ttPhkFhcmzLLYdaqxM6DpFNMkvxz8nJQWJiIgAgNjYWeXl5N22Pjo7G1atXoVKpoNfrHWaZvh3X+vy/G9oTcQq50HFs2tQEBWqUKvynoEboKEQ2ySxr+CqVSshkMsNtsVgMjUYDiaTt5Xr37o3k5GRIpVKMGTMGXl5et30+sVgEudzdpCxisZPJj+1K+RUN+MuR8xgW2Q2vPRgNJxOWYrSWsXSFzo7l4XukiDp6EVtOlGHK0J6CHkDYy/tiL+MAOJaOMEvxl8lkaGz8ZfUlnU5nKPwFBQX4+uuvcejQIbi7u+PNN9/Evn37kJSU1O7zabV61Nc3mZRFLnc3+bFdpVGlwatbTsDLVYK0B3ujocG00xStYSxdpSvGMmlAD7x34Cz+c7Icg3r6dFGyO2cv74u9jAPgWK7z8/Nsd5tZ2j5xcXHIzMwEAOTm5iIqKsqwzdPTE25ubnB1dYVYLIavry8aGuz3ik32+c3noT7+6Obhgo286IvojpnlyH/MmDHIysrCpEmT2iYtW7QIGRkZaGpqQkpKClJSUjBlyhQ4OzsjNDQU48ePN0cMq7DTcD5/GOJD5ELHsStt6/z2wKpvL6KothG9unsIHYnIZoj0er3Vz5GrVmttsu1zrkaJGZtzERvshb9N6A+xCX3+G/Gj7G9daVbjkX8cw+hoP/zp4eguSHbn7OV9sZdxABzLdRZv+1Bbn39ORj48XSV4b2y
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.lineplot(x=clusters, y=normalization_curvature)\n",
"plt.xlabel('K')\n",
"plt.ylabel('normalization curvature')"
]
},
{
"cell_type": "markdown",
"id": "08948ffb",
"metadata": {},
"source": [
"$$\n",
"S = \\frac{b - a}{\\max{(a, b)}}\n",
"$$\n",
"$S$样本点$X_i$的轮廓系数,$a$是$X_i$与同簇的其他样本的平均距离,称为凝聚度,$b$是$X_i$与最近簇中所有样本的平均距离,称为分离度。\n",
"$$\n",
"C_j = \\arg{\\min_{C_i}\\frac{1}{n}\\sum_{p\\in{C_i}}|p-X_i|^2}\n",
"$$\n",
"$C_j$是最近簇其中p是某个簇$C_k$中的样本。事实上,简单点讲,就是用$X_i$到某个簇所有样本平均距离作为衡量该点到该簇的距离后,选择离$X_i$最近的一个簇作为最近簇。求出所有样本的轮廓系数后再求平均值就得到了平均轮廓系数。平均轮廓系数的取值范围为[-1,1]且簇内样本的距离越近簇间样本距离越远平均轮廓系数越大聚类效果越好。那么很自然地平均轮廓系数最大的k便是最佳聚类数。"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "321eceaf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.3926532652562246, 0.4165845111425452, 0.45642014396553005, 0.4581536763935582, 0.46087941918329456, 0.4601172904178992, 0.4810293417489168]\n"
]
}
],
"source": [
"# 轮廓系数来确定最优K值数据量大时效率不高\n",
"SCORE = []\n",
"for i in clusters:\n",
" kmeans = KMeans(n_clusters=i)\n",
" result = kmeans.fit_predict(normalization_rfm)\n",
" score = silhouette_score(normalization_rfm, result)\n",
" SCORE.append(score)\n",
"print(SCORE)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "dbf44f9e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'SCORE')"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEECAYAAAAoDUMLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAmRklEQVR4nO3de1xUZf4H8M8wwwyXEcYLKqYioFjeMsvKklUzM00t8QK6G7m2sZlmmVZm4U/TRc3WzEhRd81ftgW/FUrJxHLzSmXmgoopKBcvgIDIxYEZ5nLO7w9wklTUgZkzl8/79fLFzJlzmO8jcD7n8szzyERRFEFERG7PQ+oCiIjIMTAQiIgIAAOBiIgaMBCIiAgAA4GIiBoopC6gOQRBgNlsXScpuVxm9baOxlXa4irtANgWR+UqbWluOzw95Tdc7tSBYDaLqKystWpbjcbH6m0djau0xVXaAbAtjspV2tLcdgQEtLrhcl4yIiIiAAwEIiJqwEAgIiIADAQiImrAQCAiIgAMBCIiasBAICIiAAwEIiKncrnWgG1Hi2CLmQts8sE0QRCwaNEiZGdnQ6lUYunSpQgKCrpuvdjYWPj7+2PevHkwGo2YP38+CgsL4eHhgSVLliA0NNQW5REROaXCKh1e3nocl2uNSHvxYXjd5BPH1rLJGcLu3bthMBiQlJSEuXPnYvny5detk5iYiJycHMvzffv2wWQyITExETNnzsTq1attURoRkVM6c6kGf/niKKr0Jnzy3AMtHgaAjQLhyJEjCA8PBwD0798fWVlZjV7PyMjA0aNHERkZaVkWHBwMs9kMQRCg1WqhUDj1qBpERC3mWFE1/pp0FACwPvJe3Ne1tU3exyZ7Xa1WC7VabXkul8thMpmgUChQWlqK+Ph4xMfHY+fOnZZ1fHx8UFhYiFGjRqGiogIJCQm3fB+5XAaNxseqGuVyD6u3dTSu0hZXaQfAtjgqZ2zL/tNlmLX1ONq3UuGTaQ+gS2sfm7XDJoGgVqtRU1NjeS4IguWIPy0tDRUVFYiJiUFZWRn0ej1CQkKQnZ2NwYMHY+7cuSguLsZzzz2H1NRUqFSqm74PB7er5yptcZV2AGyLo3K2tnx7qhQLd2YjtK0P1kzoi1YyoLKy1rkGtxswYAD2798PAMjMzERYWJjltejoaKSkpGDLli2IiYnBmDFjEBERAT8/P7RqVV+kv78/TCYTzGazLcojInJ4/84swjs7TqFfJz+sj7wXbX2VNn9Pm5whjBgxAunp6YiKioIoioiLi0Nqaipqa2sb3Te41rRp07BgwQJMnToVRqMRc+bMgY+Pc53aERE1lyiK+MdP57Dhh7MID2mDuDH32OQG8o3IRFt0ZrUTo9HMS0Zwnba4SjsAtsVROXpbBFHEqj25SMoowlO92uOdkT2h8JBdt56tLhmxKw8RkQMwmQUs3pWDtJOlmHr/XXhlSAg8ZNeHgS0xEIiIJKY3mjE/9STS8y/jpcHdMO3BLpDZOQwABgIRkaSu6E2Y82UWjhVV460RPRDRL1CyWhgIREQSuaStw+yULOSX1yJuzD14vGeApPUwEIiIJHChUodZW4/jcq0Bq8f3wUPdbPPp4zvBQCAisrPTZVq8nJwFk1nAukn90DvQT+qSAHD4ayIiu8q8UIWYpKOQy4ANUfc6TBgAPEMgIrKb9LzLeDP1V3RopUL8xL4I9POSuqRGGAhERHaw82QJFqfloEc7X3w4oQ/a+Nh+KIo7xUAgIrKxpP8W4v09ubi/iz/ef7o31CrH3PU6ZlVERC5AFEVs+OEs/vHTOQzt3hZLn7oHKoXj3rplIBAR2YAgilj5nzPYerQYY3t3wIInwm44LpEjYSAQEbUwo1nAop3Z+Da7DM8+0Bkv/yFYkqEo7hQDgYioBemMZry5/Vf8WFCBl8ODEf1gF6lLum0MBCKiFlKlM2LOlydw4mI13h7RA89IOC6RNRgIREQtoExbh5eTj+NchQ7LxvbCYz3aSV3SHWMgEBE10/kKHWZtPYZKnQkfRvTBwK7Sj0tkDQYCEVEzZJdqMTv5OAQRWDe5H3p1vPFsZM6AgUBEZKX/XqjEa1+egFqlQPyEvujW1rnngWcgEBFZYX9uORZ8fRKBfip8NKEvOjrYuETWYCAQEd2hHSdKsGRXNnp2aIUPx/eBxsdT6pJaBAOBiOgOfH7kAj7Ym4eBXTVY+XQv+CpdZzfqOi0hIrIhURSRkF6ATYfOY1iPdlg6+m4oHXhcImswEIiIbsEsiHjvP2eQcqwYz/TtiPmP94DcwcclsgYDgYioCQaTgP/ZeQq7cy7huQe7YObgbk4xLpE1GAhERDdRazDjje0ncOhsJV4ZEoI/PdBZ6pJsioFARHQDlToj5nyZhZMXr2DhyDCM7dNR6pJsjoFARPQ7JVfq8PLW4yis0mHFuF4Y0t35xiWyBgOBiOgaZy/XYtbW47hSZ8KaCX1xfxeN1CXZDQOBiKjBqZIrmJ2cBQBImNwPd3dw3nGJrMFAICICcOR8JeZ+dQKtVArET+yLoDbOPS6RNRgIROT29p6+hLd3nMRdGm98NKEvOrRSSV2SJBgIROTWtmddxN++zUGvjq3wwfg+0Hi7xrhE1mAgEJHb+uyXC/hwXx4eCtLgvXG94aOUS12SpBgIROR2RFHExwcL8L8/n8fjYQFYPKqny41LZA0GAhG5FbMgYtnu09h2/CIi+gXijeHdXXJcImswEIiokTqTgIvVelysrsPFK3p4eCpgNpig8vSASiGHSu4BlcIDSkX912v/KeUe8FJ4QCF3zKPtOpOAt74+iT2nL2H6w13x4iNBLjsukTVsEgiCIGDRokXIzs6GUqnE0qVLERQUdN16sbGx8Pf3x7x58wAA69evx/fffw+j0YgpU6Zg0qRJtiiPyG2JoogrdSYUV9fhYrW+4Wv9jv/qssu1xma/j1yGhsCQQymXwctTbgmMq2HidU2INF4mbxw2vwsgL8vjhu+tkEPlWf99mjrSrzGY8MqWI/ghrxxzhoZg6v2uPS6RNWwSCLt374bBYEBSUhIyMzOxfPlyrFu3rtE6iYmJyMnJwcCBAwEAhw4dQkZGBr744gvodDps2rTJFqURuTSzIKK8xoDihiP84mo9Ll6p++1xdR1qjeZG26gUHujYSoVAPy/0CG2LQL/6xx0alnVop0ZZuRZ1JsHyz2AWoDcJMFiWmVFnEhu+1r9+7fp1165rFlCtN6HMZL5m+9/+ic1ov8JDdt0Zy9XH5bVGlFypw+JRPTG6V4fm/Ue7KJsEwpEjRxAeHg4A6N+/P7Kyshq9npGRgaNHjyIyMhJ5eXkAgIMHDyIsLAwzZ86EVqvFG2+8ccv3kctl0Gis+/CIXO5h9baOxlXa4irtAGzXljqjGcXVehRW6lBUqUdRpQ6FVfWPCyt1KKnWw2huvEvVeHuik8YbIQFqDO4RgLs0Xuik8cZdGm908vdCG19lk5dN5HIPtPVVtnhbbkQURRjNvwWL3miG3nht6Py2zGASoL9mWZ2xfh391XCxbFu/zNfLE+8+3QdDejj/uES2+v2ySSBotVqo1WrLc7lcDpPJBIVCgdLSUsTHxyM+Ph47d+60rFNRUYGioiIkJCTgwoULmDFjBtLS0pr8RTWbRVRW1lpVo0bjY/W2jsZV2uIq7QCsb8sVvQnF1b9dvqk/uq9/XnyDyzkeMqCdrxKBfl7o3UGN4T3aIdBPhY5+Xpaj/ia7UprMqKrS2aQtzaUAoJYBaqUHoPRAS+yuXOV3rLntCAi48ZAcNgkEtVqNmpoay3NBEKBQ1L9VWloaKioqEBMTg7KyMuj1eoSEhECj0SAkJARKpRIhISFQqVS4fPky2rZta4sSiexOEK9ezqmz3LS9eknn6uWcGsP1l3PqL92oEB7SFh0bLud09FOho58KHdQqh72BS87HJoEwYMAA7NmzB6NHj0ZmZibCwsIsr0VHRyM6OhoAkJKSgry8PERERGDPnj349NNP8ec//xmlpaXQ6XTQaDS2KI+oRYmiiBqDGZU6I6p0RlTqTKgVLyO/pLrRzduSK3UwCY0v5/h5KdCxlQqd/b3xQBc
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.lineplot(x=clusters, y=SCORE)\n",
"plt.xlabel('K')\n",
"plt.ylabel('SCORE')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "85850690",
"metadata": {},
"outputs": [],
"source": [
"tup_list = []\n",
"for i in range(7):\n",
" item = (clusters[i], normalization_curvature[i] * 0.1 + SCORE[i] * 0.9)\n",
" tup_list.append(item)\n",
"tup_list = sorted(tup_list, key=lambda x: x[1], reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "7ce62d18",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"clf_KMeans聚类中心\n",
" [[ 7.54696970e-01 -1.38777878e-17 9.34848485e-02]\n",
" [ 7.45000000e-01 1.47368421e-02 7.53289474e-01]\n",
" [ 1.28099174e-01 8.67361738e-19 9.94031221e-02]\n",
" [ 1.22093023e-01 -1.73472348e-18 7.59222133e-01]]\n"
]
}
],
"source": [
"# 模型训练。得到预测值。\n",
"k = tup_list[0][0]\n",
"k_model = KMeans(n_clusters=k)\n",
"k_model.fit(normalization_rfm)\n",
"print(\"clf_KMeans聚类中心\\n\", k_model.cluster_centers_)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "e5365af7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cluster2聚类数量\n",
" 1 1900\n",
"0 1650\n",
"3 1247\n",
"2 1089\n",
"dtype: int64\n"
]
}
],
"source": [
"quantity = pd.Series(k_model.labels_).value_counts()\n",
"print(\"cluster2聚类数量\\n\", quantity)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "592e7328",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"UPDATE `sys_user` SET `persona` = 1 WHERE `account` IN ('1234567', '1951119', '1951122', '1951123', '1951124', '1951125', '1951126', '1951127', '1951128', '1951129', '1951130', '1951131', '1951132', '1951133', '1951134', '1951135', '2051101', '2051102', '2051103', '2051104', '2051105', '2051106', '2051107', '2051108', '2051109', '2051110', '2051111', '2052520')\n"
]
}
],
"source": [
"# 获取聚类之后每个聚类中心的数据\n",
"res0Series = pd.Series(k_model.labels_)\n",
"message = [\"\", \"\", \"据分析,您近期的表现不佳,请注意!\", \"据分析,您近期的表现存在严重问题,请及时整改!\"]\n",
"query_sql = \"SELECT `account` FROM `sys_user`\"\n",
"query_data = pd.read_sql(query_sql, db_engine)\n",
"for i in range(0, k):\n",
" res0 = res0Series[res0Series.values == i]\n",
" temp_list = []\n",
" for account in rfm_df2['account'].iloc[res0.index].values:\n",
" if str(account) in query_data[\"account\"].values:\n",
" temp_list.append(str(account))\n",
" if i >= 2:\n",
" insert_sql = \"INSERT INTO `sys_message` (`account`, `title`, `content`, `create_user`, `create_time`) VALUES ('\" + str(account) + \"', '分析结果提醒', '\" + message[i] + \"', 'admin', '\" + str(time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())) + \"')\"\n",
" db_engine.execute(insert_sql)\n",
" if len(temp_list) > 0:\n",
" update_sql = \"UPDATE `sys_user` SET `persona` = \" + str(i) + \" WHERE `account` IN (\" + str(temp_list).replace(\"[\", \"\").replace(\"]\", \"\") + \")\"\n",
" db_engine.execute(update_sql)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "06dd4a51",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 5
}