Commit 9a5cbe24 by 王金柱

增加基于树模型XGBoost排序

parent a65552b2
This source diff could not be displayed because it is too large. You can view the blob instead.
{"ffrom_login_encoder": {"android_xiaomi": 76, "android_vivo": 74, "android_oppo": 72, "android_huawei": 62, "0": 0, "ATK_7_android_huawei": 18, "m_login_default": 88, "TK_yyjlcs": 55, "app_ios": 79, "m_ydl": 91, "TK_4": 53, "android_10": 57, "yj_listeners_tengxun": 102, "ATK_android_yyjlcs_huawei": 31, "ATK_7_android_oppo": 22, "login_miss_ffrom_default": 86, "AppletWechatListen": 47, "ATK_4_android_yyb": 12, "ATK_7_android_xiaomi": 26, "zj": 106, "yj_listeners_douyin": 101, "ATK_android_yyjlcs_oppo": 34, "ATK_android_yyjlcs_vivo": 35, "ATK_android_yyjlcs_xiaomi": 37, "ios_1": 81, "android_yyb": 77, "login_api_sms_missing_default": 85, "android_m": 64, "ATK_7_android_360": 16, "ATK_4_android_vivo": 9, "ATK_7_android_yyb": 27, "m_wx_app": 90, "AppletWechatYdlPsyConsult": 48, "ATK_7_android_baidu": 17, "ATK_4_android_baidu": 4, "AppletWechatAiXiaoyi": 45, "ATK_android_yyjlcs_lenovo": 32, "ATK_android_yyjlcs_baidu": 30, "mini_shiyebu": 93, "ATK_7_android_vivo": 24, "AppletWechatCourseFuLi": 46, "login_api_phone_missing_default": 84, "android_sanxing": 73, "ATK_android_yyjlcs_wdj": 36, "android_medical_xiaomi": 69, "pinganbaoxian": 95, "ATK_7_android_letv": 20, "zhonganhuyi": 105, "BG-platform": 49, "android_zhuzhan": 78, "ATK_android_yyjlcs_yyb": 38, "ATK_7_android_zhuzhan": 28, "ATK_4_android_meizu": 6, "android_baidu": 61, "ATK_android_yyjlcs_360": 29, "ATK_4_android_xiaomi": 11, "m": 87, "ATK_7_android_sanxing": 23, "ydl-dhzx": 97, "ydl-pro": 99, "TK_qinggan": 54, "m_wechat": 89, "yj_listeners_ydlxlgzh": 103, "android_meizu": 71, "BG-xinting": 50, "android_appdouyin": 60, "android_medical_doctor": 65, "mini_shrink": 94, "swan": 96, "ATK_6_android_huawei": 14, "ATK_android_yyjlcs_meizu": 33, "android_medical_huawei": 66, "android_360": 58, "ATK_7_android_meizu": 21, "ATK_7_android_lenovo": 19, "ATK_4_android_huawei": 5, "android_medical_vivo": 68, "ATK_4_android_oppo": 7, "ATK_4_android_sanxing": 8, "ATK_7_android_wdj": 25, "ATK_yyjlcs_360cn": 42, 
"yj_consultant_tengxun": 100, "android_medical_oppo": 67, "ios_medical_user": 83, "ATK_4_android_360": 3, "ATK_4_android_wdj": 10, "ios_medical_doctor": 82, "ATK_qinggan": 40, "android_Cpd_Honor": 59, "ydl-emotion": 98, "BaiduSem_AppAndroidYdl": 51, "ATK_3_android_baidu": 2, "android_lenovo": 63, "baidu": 80, "yyb": 104, "android_wdj": 75, "alipay": 56, "ATK_6_android_baidu": 13, "ATK_yyjlcs_xiaomi": 44, "ATK_10_android_huawei": 1, "ATK_6_android_meizu": 15, "ATK_yyjlcs_null": 43, "android_medical_yyb": 70, "ATK_ylcs_xiaomi": 41, "meizu": 92, "ATK_aqcs_yyb": 39, "BaiduSem_zhuzhan": 52}, "user_login_city_encoder": {"0": 0, "440800": 200, "370200": 131, "411500": 161, "440300": 195, "530100": 267, "451400": 228, "120000": 3, "500000": 245, "310000": 69, "330700": 89, "440100": 193, "330200": 84, "610100": 276, "420100": 164, "330100": 83, "520100": 264, "410100": 146, "450200": 216, "441500": 205, "450300": 217, "370600": 135, "110000": 1, "410400": 149, "340300": 96, "360700": 125, "330600": 88, "331000": 92, "320100": 70, "370100": 130, "510100": 246, "441200": 202, "630100": 296, "130100": 4, "230100": 57, "320200": 71, "130900": 12, "410300": 148, "440700": 199, "451100": 225, "451300": 227, "650800": 310, "650500": 307, "370800": 137, "220100": 49, "445200": 213, "431200": 191, "330300": 85, "460100": 229, "450800": 222, "511100": 255, "320500": 74, "411100": 157, "441901": 210, "430100": 180, "411400": 160, "140800": 22, "450700": 221, "520300": 266, "370500": 134, "130700": 10, "610600": 281, "610500": 280, "220500": 53, "320300": 72, "350100": 110, "360100": 119, "440500": 197, "410800": 153, "131000": 13, "140900": 23, "440600": 198, "540100": 275, "150500": 30, "370700": 136, "420600": 168, "340100": 94, "340400": 97, "431000": 189, "371700": 145, "130600": 9, "630200": 297, "350800": 117, "610400": 279, "340700": 100, "510900": 253, "630300": 298, "421100": 173, "340200": 95, "330500": 87, "140300": 17, "150100": 26, "231000": 66, "620100": 284, "220700": 
55, "130800": 11, "442000": 211, "210200": 36, "361000": 128, "140500": 19, "610700": 282, "341300": 105, "350600": 115, "450100": 215, "421300": 175, "450500": 219, "441400": 204, "320700": 76, "140100": 15, "510400": 248, "370900": 138, "320400": 73, "130200": 5, "371300": 141, "360400": 122, "230500": 61, "650100": 303, "431300": 192, "130500": 8, "430200": 181, "511700": 260, "341700": 108, "370400": 133, "530800": 273, "371500": 143, "210100": 35, "350900": 118, "431100": 190, "340600": 99, "141000": 24, "420300": 166, "150600": 31, "610800": 283, "341200": 104, "511500": 258, "350200": 111, "511300": 256, "210800": 42, "230300": 59, "210900": 43, "140700": 21, "350500": 114, "350700": 116, "141100": 25, "441300": 203, "421200": 174, "445100": 212, "PHBWH00": 362, "530400": 269, "150400": 29, "210400": 38, "441600": 206, "510700": 251, "450900": 223, "441800": 208, "130300": 6, "430900": 188, "360300": 121, "441900": 209, "640100": 301, "371400": 142, "621200": 295, "211400": 48, "510600": 250, "530300": 268, "610300": 278, "231200": 68, "621000": 293, "371600": 144, "150900": 34, "211300": 47, "430500": 184, "430400": 183, "410500": 150, "530900": 274, "411600": 162, "211100": 45, "620900": 292, "211200": 46, "320900": 78, "441700": 207, "512000": 263, "620600": 289, "361100": 129, "510800": 252, "620500": 288, "330400": 86, "411300": 159, "231100": 67, "130400": 7, "341000": 102, "320600": 75, "650300": 305, "230800": 64, "420200": 165, "440400": 196, "350400": 113, "330900": 91, "440900": 201, "210700": 41, "420900": 171, "320800": 77, "371000": 139, "321300": 82, "429005": 177, "360900": 127, "430800": 187, "340500": 98, "520200": 265, "350300": 112, "210600": 40, "140200": 16, "410700": 152, "651000": 311, "511600": 259, "42A2100": 179, "420500": 167, "450400": 218, "511000": 254, "230200": 58, "651300": 314, "411000": 156, "410200": 147, "360200": 120, "430300": 182, "220800": 56, "430600": 185, "NSHF00": 354, "445300": 214, "370300": 132, "410900": 155, 
"131100": 14, "410600": 151, "510500": 249, "650700": 309, "421000": 172, "510300": 247, "621100": 294, "330800": 90, "NSWHBS00": 355, "321000": 79, "140400": 18, "620700": 290, "331100": 93, "341500": 106, "220200": 50, "440200": 194, "150800": 33, "630400": 299, "511900": 262, "371100": 140, "321100": 80, "411700": 163, "420700": 169, "PKTAS00": 364, "430700": 186, "410881": 154, "511800": 261, "650600": 308, "420800": 170, "NY0000": 357, "651500": 316, "341600": 107, "46A3100": 239, "QLDBNE00": 365, "651200": 313, "469005": 233, "110105": 2, "WASEA00": 380, "650200": 304, "360800": 126, "CASFO00": 330, "511400": 257, "150300": 28, "451000": 224, "651400": 315, "450600": 220, "620800": 291, "210300": 37, "150200": 27, "411200": 158, "451200": 226, "230400": 60, "429004": 176, "610200": 277, "321200": 81, "630500": 300, "460200": 230, "230600": 62, "360600": 124, "341800": 109, "651100": 312, "340800": 101, "530500": 270, "530600": 271, "620400": 287, "MOSTL00": 352, "429006": 178, "341100": 103, "46A3300": 240, "210500": 39, "46A2500": 235, "220300": 51, "KAZAKS00": 346, "140600": 20, "SCTARD00": 369, "150700": 32, "211000": 44, "230900": 65, "620200": 285, "JPN2700": 345, "640200": 302, "JPN1300": 344, "620300": 286, "PHLMNL00": 363, "650400": 306, "530700": 272, "220400": 52, "ILCHI00": 339, "CANTOR00": 328, "KLKUL00": 348, "VICMEL00": 375, "360500": 123, "ITAROM00": 342, "46A3500": 242, "230700": 63, "KL0000": 347, "810100": 319, "PAPHL00": 361, "SPE0000": 370, "RUSMOW00": 366, "810300": 320, "46A2700": 236, "DEUNUE00": 334, "NZLAUK00": 358, "469006": 234, "CALAX00": 323, "CANVAN00": 329, "46A2800": 237, "FRAPAR00": 338, "SAADL00": 368, "MEXMEX00": 350, "930100": 321, "IRLDB00": 341, "220600": 54, "ENGLND00": 335, "46A3400": 241, "AREDU00": 322, "VNMHI00": 376, "469002": 231, "WAPER00": 379, "DEUHH00": 332, "WA0000": 377, "TH-100000": 371, "CANSUD00": 327, "CANMTR00": 325, "469003": 232, "CANLOD00": 324, "THA1000": 372, "ESPVLL00": 337, "NZLHLZ00": 359, 
"TXAUS00": 373, "CHLRM00": 331, "49SEO00": 244, "NVLAS00": 356, "RUSSPE00": 367, "TXDAL00": 374, "ESPMAD00": 336, "MNG6100": 351, "NLDAMS00": 353, "OR0000": 360, "IRLCK00": 340, "JK0000": 343, "WAALH00": 378, "651600": 317, "MABZD00": 349, "46A3600": 243, "CANOTT00": 326, "DEUKEL00": 333, "651700": 318, "46A3000": 238}, "cate_id_1_encoder": {"26": 5, "23": 3, "0": 0, "27": 6, "1": 1, "452": 7, "22": 2, "699": 8, "25": 4}, "cate_id_2_encoder": {"22": 2, "25": 4, "0": 0, "27": 6, "452": 7, "26": 5, "699": 8, "1": 1, "23": 3}, "cate_id_3_encoder": {"27": 6, "452": 7, "0": 0, "22": 2, "1": 1, "25": 4, "23": 3, "26": 5, "699": 8}, "cate_id_4_encoder": {"25": 4, "1": 1, "0": 0, "23": 3, "452": 7, "699": 8, "26": 5, "27": 6, "22": 2}, "cate_id_5_encoder": {"452": 7, "27": 6, "0": 0, "1": 1, "26": 5, "22": 2, "23": 3, "25": 4, "699": 8}}
\ No newline at end of file
...@@ -5,9 +5,12 @@ import json ...@@ -5,9 +5,12 @@ import json
import configparser import configparser
from typing import List from typing import List
import xgboost as xgb
import time
import faiss import faiss
import requests import requests
import numpy as np import numpy as np
import pandas as pd
from ydl_ai_recommender.src.core.indexer import ( from ydl_ai_recommender.src.core.indexer import (
UserCounselorChatIndexer, UserCounselorChatIndexer,
...@@ -17,9 +20,10 @@ from ydl_ai_recommender.src.core.indexer import ( ...@@ -17,9 +20,10 @@ from ydl_ai_recommender.src.core.indexer import (
CounselorCounselorCFIndexer, CounselorCounselorCFIndexer,
) )
from ydl_ai_recommender.src.core.profile import encode_user_profile from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.utils import get_conf_path, get_data_path from ydl_ai_recommender.src.utils import get_conf_path, get_data_path, read_user_encoder_dict, read_counselors, get_project_path
from ydl_ai_recommender.src.utils.log import create_logger from ydl_ai_recommender.src.utils.log import create_logger
logger = create_logger(__name__, 'service_xgb.log', is_rotating=True)
class Recommender(): class Recommender():
...@@ -240,7 +244,167 @@ class ItemCFRecommender(Recommender): ...@@ -240,7 +244,167 @@ class ItemCFRecommender(Recommender):
counselors = counselors[:size] counselors = counselors[:size]
return counselors return counselors
class RecommendByXgboost(Recommender):
    """Rank counselors for a user with a pre-trained XGBoost binary classifier.

    For every counselor in the locally loaded counselor table, the encoded
    user features are joined with the counselor features and the model's
    positive-class probability is used as the ranking score.
    """

    # Column names given to the encoded user features, in the order produced
    # by ``trans_user_feature_data``.
    USER_FEATURE_COLUMNS = [
        'ffrom_login_encoder', 'user_login_city_encoder',
        'cate_id_1_encoder', 'cate_id_2_encoder', 'cate_id_3_encoder',
        'cate_id_4_encoder', 'cate_id_5_encoder',
    ]
    # Number of preference categories encoded per user.
    PREFERENCE_CATE_NUM = 5
    # Maximum number of ranked counselors returned by ``recommend``.
    MAX_RESULTS = 50
    # Seconds to wait for the DMP profile service before giving up, so a hung
    # upstream cannot block the caller indefinitely.
    DMP_TIMEOUT = 10

    def __init__(self, top_n=5, k=20, is_use_db=True, u2c='combination', c2c=None) -> None:
        """Load configuration, encoder tables, counselor features and the model.

        ``top_n``, ``k``, ``is_use_db``, ``u2c`` and ``c2c`` are accepted for
        signature compatibility but are not used by this recommender.
        """
        super().__init__()

        config = configparser.RawConfigParser()
        config.read(get_conf_path())
        self.dmp_url = config.get('DMP', 'url')

        select_items = ['uid', 'ffrom_login', 'user_login_city', 'user_preference_cate']
        self.select_fields = {field: True for field in select_items}

        self.user_encoder_convert = read_user_encoder_dict()
        self.all_counselors = read_counselors()

        self.params = {'n_estimators': 150, 'max_depth': 7, 'min_child_weight': 5, 'gamma': 0, 'subsample': 0.9,
                       'colsample_bytree': 0.5, 'reg_alpha': 0, 'reg_lambda': 1, 'learning_rate': 0.1,
                       'max_delta_step': 0,
                       'scale_pos_weight': 1}
        self.model = xgb.XGBClassifier(objective='binary:logistic', nthread=-1, **self.params)
        self.model.load_model(os.path.join(get_project_path(), 'model_data/xgb_model.bin'))

    def recommend(self, user_id, size=0, is_merge=True):
        """Return up to ``MAX_RESULTS`` counselors ordered by predicted score.

        Args:
            user_id: id of the user to recommend for.
            size: forwarded to ``_recommend_top`` when no profile is found.
            is_merge: accepted for interface compatibility; unused here.

        Returns:
            A list of dicts with the keys ``counselor``, ``score`` and ``from``,
            or the result of ``self._recommend_top(size)`` when the user has
            no profile.

        NOTE(review): ``size`` does not limit the ranked result; the ranked
        path always returns at most ``MAX_RESULTS`` entries. Confirm intended.
        """
        s_u_profile_time = time.time()
        user_profile = self.get_user_profile(user_id)
        logger.info('s_u_profile_time: {} '.format(time.time()-s_u_profile_time))
        if not user_profile:
            # _recommend_top is not defined in this class; presumably it is
            # inherited from Recommender.
            return self._recommend_top(size)

        # The recall stage is currently disabled: every counselor is scored.
        recall_start = time.time()
        recommend_result = None
        logger.info('recall call time:{}'.format(time.time()-recall_start))

        data_time = time.time()
        # Reuse the profile fetched above instead of requesting it a second time.
        predit_data = self.trans_feature_data(user_id, recommend_result, user_profile=user_profile)
        logger.info('data_time: {}'.format(time.time()-data_time))

        # The id column is not passed to the model; it is kept aside to label
        # the scores.
        doctor_ids = predit_data.pop('doctor_id').to_numpy()

        pre_time = time.time()
        predit_result = self.model.predict_proba(predit_data)[:, 1]
        logger.info('predit_time:{}'.format(time.time()-pre_time))

        scores = dict(zip(doctor_ids, predit_result))
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return [{
            'counselor': int(c_id),
            'score': float(proba),
            'from': 'similar_users {}'.format(user_id),
        } for (c_id, proba) in ranked[:self.MAX_RESULTS]]

    def trans_feature_data(self, user_id, counselor_data, user_profile=None):
        """Build the prediction frame: one row per counselor.

        The encoded user features are repeated for every counselor row and
        placed to the left of the counselor feature columns.

        Args:
            user_id: id of the user.
            counselor_data: forwarded to ``trans_counselor_feature_data``.
            user_profile: optional already-fetched profile; fetched when None.
        """
        user_features = self.trans_user_feature_data(user_id, user_profile)
        counselor_features = self.trans_counselor_feature_data(counselor_data)
        user_frame = pd.DataFrame(
            [user_features] * len(counselor_features),
            columns=self.USER_FEATURE_COLUMNS,
        )
        # concat(axis=1) aligns rows on index labels. The counselor table is
        # read with index_col=0, so normalise it to 0..n-1 to match user_frame.
        return pd.concat([user_frame, counselor_features.reset_index(drop=True)], axis=1)

    def trans_user_feature_data(self, user_id, user_profile=None):
        """Encode a user's profile into the list of 7 user feature values.

        Args:
            user_id: id of the user; used to fetch the profile when
                ``user_profile`` is None.
            user_profile: optional already-fetched profile dict.

        Returns:
            ``[ffrom_login, user_login_city, cate_1 .. cate_5]`` encoded as
            integers; unknown or missing values are encoded as 0.
        """
        if user_profile is None:
            user_profile = self.get_user_profile(user_id)
        # get_user_profile returns [] on failure; treat that as "no fields".
        profile = user_profile or {}

        user_feature_data = [
            self.get_encoder_from_dict('ffrom_login', profile.get('ffrom_login')),
            self.get_encoder_from_dict('user_login_city', profile.get('user_login_city')),
        ]
        user_feature_data.extend(
            self.process_user_preference_cate(profile.get('user_preference_cate'))
        )
        return user_feature_data

    def trans_counselor_feature_data(self, counselor_data):
        """Return the counselor feature table.

        ``counselor_data`` is currently ignored: the full table loaded at
        construction time is returned.
        """
        return self.all_counselors

    def get_user_profile(self, user_id):
        """Fetch the user's profile from the DMP service.

        Returns:
            The first object of the response, or ``[]`` when ``user_id`` is
            ``'0'`` or the request/parsing fails.
        """
        if user_id == '0':
            return []

        headers = {
            'X-App-Id': 'plough_cloud',
            'Content-Type': 'application/json'
        }
        payload = {
            "filter": {
                "uid": user_id,
            },
            "fields": self.select_fields,
            "limit": 10
        }

        # self.logger is not set in this class; presumably the Recommender
        # base class provides it.
        resp = None
        try:
            get_profile_time = time.time()
            response = requests.request('POST', self.dmp_url, headers=headers, json=payload,
                                        timeout=self.DMP_TIMEOUT)
            self.logger.info(' get user profile cost {} ms'.format((time.time()-get_profile_time)*1000))
            resp = response.json()
            return resp['data']['objects'][0]
        except Exception as e:
            self.logger.error('获取用户画像数据失败: %s', e, exc_info=True)
            # Only log the payload when the response was actually decoded.
            if resp is not None:
                self.logger.exception('response json data %s', resp)
        return []

    def process_user_preference_cate(self, preference_cate):
        """Encode the user's top preference categories.

        Args:
            preference_cate: JSON string holding a list of objects with a
                ``cate_id`` key; any other value is treated as "no preferences".

        Returns:
            A list of ``PREFERENCE_CATE_NUM`` integers, position ``i`` encoded
            with the ``cate_id_{i+1}_encoder`` table; 0 for missing/unknown.
        """
        cate_ids = []
        if isinstance(preference_cate, str):
            try:
                pref_data = json.loads(preference_cate)
            except ValueError:
                logger.warning('user_preference_cate is not valid JSON: %s', preference_cate)
                pref_data = []
            if isinstance(pref_data, list):
                cate_ids = [info.get('cate_id', 0) for info in pref_data if isinstance(info, dict)]
        cate_ids = cate_ids[:self.PREFERENCE_CATE_NUM]

        encoder_result = []
        for position in range(self.PREFERENCE_CATE_NUM):
            cate_id = cate_ids[position] if position < len(cate_ids) else 0
            table = self.user_encoder_convert.get('cate_id_{}_encoder'.format(position + 1))
            if table is None:
                # Keep the slot so later positions do not shift left.
                encoder_result.append(0)
            else:
                # Encoder tables are keyed by strings (see get_encoder_from_dict).
                encoder_result.append(table.get(str(cate_id), 0))
        return encoder_result

    def get_encoder_from_dict(self, feature_name, feature_value):
        """Look up the integer code of ``feature_value`` for ``feature_name``.

        Returns 0 when the ``<feature_name>_encoder`` table or the value is
        unknown.
        """
        value_convert_dict = self.user_encoder_convert.get('{}_encoder'.format(feature_name))
        if value_convert_dict is None:
            return 0
        return value_convert_dict.get(str(feature_value), 0)

    def aa(self):
        """Placeholder; does nothing."""
        pass
if __name__ == '__main__':
    # Manual smoke test: time the user-CF recommender, then the XGBoost
    # recommender cold (including model loading) and warm.
    s_time = time.time()
    recommender1 = UserCFRecommender()
    recommender1.recommend('30004410')
    print('all cost time: {}'.format(time.time() - s_time), recommender1.recommend('12047'))
    print()
    print()

    s_time = time.time()
    recommender = RecommendByXgboost()
    recommender.recommend('30004410')
    # The format string needs a placeholder, otherwise the elapsed time is
    # silently dropped from the output.
    print('all cost time: {}'.format(time.time() - s_time), recommender.recommend('12047'))
    print()
    print()

    s_time = time.time()
    recommender.recommend('30004410')
    print('all cost time: {}'.format(time.time() - s_time), recommender.recommend('12047'))
\ No newline at end of file
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json import json
import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import tornado.web import tornado.web
...@@ -73,13 +75,71 @@ class RecommendHandler(tornado.web.RequestHandler): ...@@ -73,13 +75,71 @@ class RecommendHandler(tornado.web.RequestHandler):
logger.info('response@@uid=%s@@ret=%s', uid, ret_str) logger.info('response@@uid=%s@@ret=%s', uid, ret_str)
return ret return ret
class RecommendXgbHandler(tornado.web.RequestHandler):
    """HTTP handler that serves counselor recommendations via GET and POST.

    Both verbs read ``uid`` and ``size`` and delegate to ``run``, which is
    executed on the class-level single-worker thread pool.
    ``run_on_executor`` and ``recommender`` are module-level names defined
    outside this class.
    """

    executor = ThreadPoolExecutor(1)
    # Size used when the request omits ``size`` or sends a non-numeric value.
    DEFAULT_SIZE = 100

    @staticmethod
    def _parse_size(size):
        """Coerce ``size`` to int, falling back to ``DEFAULT_SIZE``."""
        try:
            return int(size)
        except (TypeError, ValueError, OverflowError):
            logger.warning('size=%s 不是数字', size)
            return RecommendXgbHandler.DEFAULT_SIZE

    @staticmethod
    def _error_response():
        """Return the error payload used when a request cannot be served."""
        return {
            'status': 'error',
            'code': 1,
            'data': [],
            'total_count': 0,
        }

    @tornado.gen.coroutine
    def get(self):
        """Handle GET: ``uid`` and ``size`` come from the query arguments."""
        uid = self.get_argument('uid', None)
        if uid is None:
            logger.warning('请求参数不正确,无uid')
        size = self._parse_size(self.get_argument('size', self.DEFAULT_SIZE))
        ret = yield self.run(uid, size)
        self.write(ret)

    @tornado.gen.coroutine
    def post(self):
        """Handle POST: ``uid`` and ``size`` come from a JSON object body."""
        try:
            param = json.loads(self.request.body.decode('utf-8'))
        except ValueError:
            # Covers both invalid UTF-8 and invalid JSON.
            param = None
        if not isinstance(param, dict):
            logger.warning('request body is not a JSON object')
            self.write(self._error_response())
            return

        uid = param.get('uid', None)
        # Apply the same coercion as GET so a non-numeric size cannot reach
        # the recommender.
        size = self._parse_size(param.get('size', self.DEFAULT_SIZE))
        if uid is None:
            logger.warning('请求参数不正确,无uid')
        ret = yield self.run(uid, size)
        self.write(ret)

    @run_on_executor
    def run(self, uid, size=100):
        """Call the recommender and wrap its result in the response payload.

        Any exception raised by the recommender is logged and converted into
        the error payload instead of propagating.
        """
        logger.info('request@@uid=%s@@size=%s', uid, size)
        try:
            start_time = time.time()
            recommend_result = recommender.recommend(uid, size=size, is_merge=True)
            logger.info('request@@uid=%s@@size=%s, cost %s ms', uid, size, (time.time()-start_time)*1000)
            ret = {
                'status': 'success',
                'code': 0,
                'data': recommend_result,
                'total_count': len(recommend_result),
            }
        except Exception:
            logger.error('执行推荐函数报错', exc_info=True)
            ret = self._error_response()
        ret_str = json.dumps(ret, ensure_ascii=False)
        logger.info('response@@uid=%s@@ret=%s', uid, ret_str)
        return ret
if __name__ == '__main__':
    # Service entry point: parse the port option, register both recommend
    # routes and start the Tornado IO loop.
    tornado.options.define('port', default=8868, type=int, help='服务启动的端口号')
    tornado.options.parse_command_line()

    routes = [
        (r'/ai_counselor_recommend', RecommendHandler),
        (r'/ai_counselor_recommend/xgb/v1', RecommendXgbHandler),
    ]
    app = tornado.web.Application(handlers=routes, autoreload=False, debug=False)

    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(tornado.options.options.port)
    tornado.ioloop.IOLoop.instance().start()
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import json
import pandas as pd
def get_project_path(): def get_project_path():
...@@ -21,4 +23,15 @@ def get_conf_path(): ...@@ -21,4 +23,15 @@ def get_conf_path():
def get_model_path():
    """Return the path of the ``model`` directory under the project root."""
    return os.path.join(get_project_path(), 'model')
\ No newline at end of file
def read_user_encoder_dict():
    """Load the user feature encoder tables from the project's model data.

    Returns:
        The parsed content of ``model_data/user_encoder_json_data.json``
        under the project root.
    """
    encoder_path = os.path.join(get_project_path(), 'model_data/user_encoder_json_data.json')
    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default encoding.
    with open(encoder_path, 'r', encoding='utf-8') as f:
        return json.load(f)
def read_counselors():
    """Load the counselor feature table from the project's model data.

    The first CSV column is used as the index and ``doctor_id`` is read as a
    string.
    """
    csv_path = os.path.join(get_project_path(), 'model_data/doctor_profile_selected_feature.csv')
    return pd.read_csv(csv_path, sep=',', index_col=0, dtype={'doctor_id': str})
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment