Commit b3206578 by 柴鹏飞

增加精排模型

parent 31041a93
# -*- coding: utf-8 -*-
import json
import random
from ydl_ai_recommender.src.core.manager import (
CounselorProfileManager,
UserEventManager,
)
from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.core.counselor_profile import encode_counselor_profile
def make_data(start_date, end_date, type):
# 平台用户数量太多,仅处理与本次任务相关的用户
manager = UserEventManager()
user_profiles = manager.load_xlsx_data(f'expose_click_user_{start_date}_{end_date}.xlsx')
user_embeddings = {}
for _, profile in user_profiles.iterrows():
user_embeddings[str(profile['uid'])] = encode_user_profile(profile)
counselor_profile_manager = CounselorProfileManager()
counselor_profiles = counselor_profile_manager.load_xlsx_data('all_counselor_profile.xlsx')
counselor_embeddings = {}
for _, profile in counselor_profiles.iterrows():
counselor_embeddings[str(profile['doctor_id'])] = encode_counselor_profile(profile)
# 曝光点击数据
train_data = []
expost_click = manager.load_data(start_date=start_date, end_date=end_date)
for _, profile in expost_click.iterrows():
uid = str(profile['user_ids'])
if ',' in uid:
continue
if uid not in user_embeddings:
continue
expose_ids = set(str(profile['ae_doctor_ids']).split(','))
click_ids = set(str(profile['doctor_card_doctor_ids']).split(',') + str(profile['private_chat_doctor_ids']).split(','))
click_ids2 = [_id for _id in click_ids if _id in counselor_embeddings]
for _id in click_ids2:
train_data.append([user_embeddings[uid] + counselor_embeddings[_id], 1])
uninterested_ids = list(expose_ids - click_ids)
for _id in random.choices(uninterested_ids, k=min(len(click_ids2), len(uninterested_ids))):
if _id not in counselor_embeddings:
continue
train_data.append([user_embeddings[uid] + counselor_embeddings[_id], 0])
print('data count: ', len(train_data))
print('label=0\t{}'.format(len(list(filter(lambda x: x[1] == 0, train_data)))))
print('label=1\t{}'.format(len(list(filter(lambda x: x[1] == 1, train_data)))))
with open(f'data/all_{type}.json', 'w') as f:
json.dump(train_data, f)
def make_train_data():
start_date = '2023-01-01'
end_date = '2023-02-01'
make_data(start_date, end_date, 'train')
def make_test_data():
start_date = '2023-02-01'
end_date = '2023-02-10'
make_data(start_date, end_date, 'test')
if __name__ == '__main__':
make_train_data()
make_test_data()
\ No newline at end of file
# -*- coding: utf-8 -*-
import os
import pickle
from typing import List
from ydl_ai_recommender.src.utils.log import create_logger
from ydl_ai_recommender.src.utils import get_conf_path, get_model_path
from ydl_ai_recommender.src.data.mysql_client import MySQLClientPool
from ydl_ai_recommender.src.core.manager import CounselorProfileManager
from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.core.counselor_profile import encode_counselor_profile
class Ranker():
def __init__(self, is_use_db=True) -> None:
self.logger = create_logger(__name__, 'ranker.log')
if is_use_db:
self.client = MySQLClientPool.create_from_config_file(get_conf_path())
else:
self.client = None
self.logger.warn('未连接数据库')
self.select_items_str = ', '.join([
'uid', 'country_code', 'channel_id_type', 'ffrom_login', 'user_preference_cate', 'consult_pay_money',
'listen_pay_money', 'test_items_pay_money', 'course_pay_money', 'consult_order_num',
'listen_order_num', 'test_items_order_num', 'course_order_num', 'aidi_cst_bias_city',
'aidi_cst_bias_sex', 'aidi_cst_bias_price', 'aidi_cst_bias_server_type', 'user_login_city',
'd30_inquire_order_num', 'd30_session_num',
])
# 用户画像embedding维度,后期需优化
self.user_embedding_dimension = 98
self.counselor_embeddings_dimension = 171
self.counselor_embeddings = self._load_counselor_embeddings()
model_path = os.path.join(get_model_path(), 'ranker', 'LR_v1.1.pkl')
with open(model_path, 'rb') as f:
self.model = pickle.load(f)
def _load_counselor_embeddings(self):
counselor_profile_manager = CounselorProfileManager()
counselor_profiles = counselor_profile_manager.load_xlsx_data('all_counselor_profile.xlsx')
counselor_embeddings = {}
for _, profile in counselor_profiles.iterrows():
counselor_embeddings[str(profile['doctor_id'])] = encode_counselor_profile(profile)
return counselor_embeddings
def get_user_profile(self, user_id):
if user_id == '0':
return None
sql = 'SELECT {} FROM ads.ads_register_user_profiles'.format(self.select_items_str)
sql += ' WHERE uid={}'.format(user_id)
try:
_, all_data = self.client.query(sql)
if len(all_data) == 0:
return []
return all_data[0]
except Exception as e:
self.logger.error('获取用户画像数据失败', exc_info=True)
return None
def default_user_embedding(self):
return [.0] * self.user_embedding_dimension
def get_user_embedding(self, user_id):
user_profile = self.get_user_profile(user_id)
if not user_profile:
return self.default_user_embedding()
user_embedding = encode_user_profile(user_profile)
return user_embedding
def default_counselor_embedding(self):
return [.0] * self.counselor_embeddings_dimension
def get_counselor_embeddings(self, counselors: List):
return [
self.counselor_embeddings.get(counselor, self.default_counselor_embedding())
for counselor in counselors
]
def compute_score(self, user_embedding, counselor_embeddings):
model_input = [
user_embedding + counselor_embedding
for counselor_embedding in counselor_embeddings
]
result = self.model.predict_proba(model_input)
return map(lambda x: x[1] * 100, result)
def _rank(self, counselors, user_embedding, counselor_embeddings):
""" 为了方便测试,精排流程拆分为两个函数 """
scores = self.compute_score(user_embedding, counselor_embeddings)
ret = []
for score, counselor in sorted(zip(scores, counselors), reverse=True):
ret.append({
'counselor': counselor,
'score': score,
})
return ret
def rank(self, uid, counselors):
counselors = counselors.strip()
if not counselors:
return []
counselors = counselors.split(',')
user_embedding = self.get_user_embedding(uid)
counselor_embeddings = self.get_counselor_embeddings(counselors)
return self._rank(counselors, user_embedding, counselor_embeddings)
if __name__ == '__main__':
ranker = Ranker()
print(ranker.rank('13577448', '17607,4792,4202,13588,11600,13242'))
print(ranker.rank('0', '17607,4792,4202,13588,11600,0'))
\ No newline at end of file
# -*- coding: utf-8 -*-
import json
import random
from collections import Counter
from ydl_ai_recommender.src.core.manager import (
CounselorProfileManager,
UserEventManager,
)
from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.core.counselor_profile import encode_counselor_profile
from ydl_ai_recommender.src.core.rank.ranker import Ranker
ranker = Ranker(is_use_db=False)
def make_data(start_date, end_date):
# 平台用户数量太多,仅处理与本次任务相关的用户
manager = UserEventManager()
user_profiles = manager.load_xlsx_data(f'expose_click_user_{start_date}_{end_date}.xlsx')
user_embeddings = {}
for _, profile in user_profiles.iterrows():
user_embeddings[str(profile['uid'])] = encode_user_profile(profile)
counselor_profile_manager = CounselorProfileManager()
counselor_profiles = counselor_profile_manager.load_xlsx_data('all_counselor_profile.xlsx')
counselor_embeddings = {}
for _, profile in counselor_profiles.iterrows():
counselor_embeddings[str(profile['doctor_id'])] = encode_counselor_profile(profile)
# 曝光点击数据
test_data = []
expost_click = manager.load_data(start_date=start_date, end_date=end_date)
for _, profile in expost_click.iterrows():
uid = str(profile['user_ids'])
if ',' in uid:
continue
if uid not in user_embeddings:
u_emb = ranker.default_user_embedding()
else:
u_emb = user_embeddings[uid]
expose_ids = set(str(profile['ae_doctor_ids']).split(','))
click_ids = set(str(profile['doctor_card_doctor_ids']).split(',') + str(profile['private_chat_doctor_ids']).split(','))
uninterested_ids = list(expose_ids - click_ids)
for _id in click_ids:
click_id = _id
if _id in user_embeddings:
click_emb = counselor_embeddings[_id]
else:
click_emb = ranker.default_counselor_embedding()
c_embs = [click_emb]
c_ids = [click_id]
for _id in random.choices(uninterested_ids, k=min(19, len(uninterested_ids))):
if _id not in counselor_embeddings:
continue
c_embs.append(counselor_embeddings[_id])
c_ids.append(_id)
if len(c_embs) < 2:
continue
test_data.append([
c_ids,
u_emb,
c_embs,
])
print('test data row: ', len(test_data))
return test_data
def test():
start_date = '2023-02-01'
end_date = '2023-02-10'
test_data = make_data(start_date, end_date)
result_detail = []
for [c_ids, u_emb, c_embs] in test_data:
rank_result = ranker._rank(c_ids, u_emb, c_embs)
for i, result in enumerate(rank_result):
if result['counselor'] == c_ids[0]:
result_detail.append(i)
break
else:
result_detail.append(-1)
cnter = Counter(result_detail)
acc_detail = {}
for key, val in cnter.most_common():
acc_detail[key] = val
# print(key, val, val/len(result_detail))
print('==' * 20)
cnt = 0
for n in range(10):
if n in (1, 3, 5, 10):
print('top{:2}\t{:5}\t{:.2%}'.format(n, cnt, cnt / len(result_detail)))
cnt += acc_detail[n]
if __name__ == '__main__':
test()
\ No newline at end of file
# -*- coding: utf-8 -*-
import os
import json
import pickle
import argparse
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
models = {
'LR': LogisticRegression(),
'LR_SS': make_pipeline(StandardScaler(), LogisticRegression()),
'MLP': MLPClassifier(),
'KNN': KNeighborsClassifier(),
'SVM': SVC(),
'DT': DecisionTreeClassifier(),
'RF': RandomForestClassifier(),
'Ada': AdaBoostClassifier(),
}
def main(args):
with open(os.path.join(args.data_path, 'all_train.json'), 'r') as f:
train_data = json.load(f)
x_train, y_train = [], []
for x, y in train_data:
x_train.append(x)
y_train.append(y)
model = models[args.model]
print('开始训练模型 ', args.model)
model.fit(x_train, y_train)
print('模型训练完成')
model_save_path = os.path.join(args.save_path, args.model + '.pkl')
with open(model_save_path, 'wb') as f:
pickle.dump(model, f)
print('模型已保存至 ', model_save_path)
if args.do_test:
with open('data/all_test.json', 'r') as f:
test_data = json.load(f)
x_test, y_test = [], []
for x, y in test_data:
x_test.append(x)
y_test.append(y)
print('{}\t{}'.format(args.model, model.score(x_test, y_test)))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='壹点灵 咨询师推荐 精排模型训练')
parser.add_argument('-m', '--model', type=str, default='LR', choices=list(models.keys()), help='模型类型')
parser.add_argument('--data_path', type=str, default='./data', help='训练数据存放目录')
parser.add_argument('--save_path', type=str, default='./model', help='训练好的模型存放目录')
parser.add_argument('--do_test', default=False, action='store_true', help='训练完成后执行测试程序')
args = parser.parse_args()
main(args)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment