Commit 33d575d5 by 王金柱

Fine-grained ranking (精排序)

parent 6fe86fc6
# -*- coding:utf-8 -*-
import time
import requests
import os
import json
import configparser

import xgboost as xgb
import pandas as pd
import numpy as np

from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.utils import get_conf_path, get_data_path, read_user_encoder_dict, read_counselors, get_project_path
from ydl_ai_recommender.src.utils.log import create_logger

logger = create_logger(__name__, 'rankByXgb.log')


def cost_time(desc):
    def the_func_cost_time(func):
        def fun(*args, **kwargs):
            t = time.perf_counter()
            result = func(*args, **kwargs)
            logger.info('function: {}, {} took {} ms'.format(str(func.__name__), desc, round((time.perf_counter() - t) * 1000, 2)))
            return result
        return fun
    return the_func_cost_time


class RankByXGB:

    def __init__(self, is_use_db=True) -> None:
        self.logger = create_logger(__name__, 'rankByXgb.log')
        config = configparser.RawConfigParser()
        config.read(get_conf_path())
        self.dmp_url = config.get('DMP', 'url')

        select_items = ['uid', 'ffrom_login', 'user_login_city', 'user_preference_cate']
        self.select_fields = {k: True for k in select_items}
        self.user_encoder_convert = read_user_encoder_dict()
        self.all_counselors = read_counselors()

        self.params = {'n_estimators': 150, 'max_depth': 7, 'min_child_weight': 5, 'gamma': 0, 'subsample': 0.9,
                       'colsample_bytree': 0.5, 'reg_alpha': 0, 'reg_lambda': 1, 'learning_rate': 0.1,
                       'max_delta_step': 0, 'scale_pos_weight': 1}
        self.model = xgb.XGBClassifier(objective='binary:logistic', nthread=-1, **self.params)
        self.model.load_model(os.path.join(get_project_path(), 'model_data/xgb_model.bin'))
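
    # A minimal sketch (an assumption, not part of this commit) of how a compatible
    # 'model_data/xgb_model.bin' could be produced with the same hyper-parameters;
    # train_X must follow the column layout built by trans_feature_data below and
    # train_y is the binary relevance label:
    #
    #     model = xgb.XGBClassifier(objective='binary:logistic', nthread=-1, **params)
    #     model.fit(train_X, train_y)
    #     model.save_model('model_data/xgb_model.bin')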

    @cost_time(desc='full model recommendation flow')
    def rank(self, user_id, counselors):
        if not counselors or not counselors.strip():
            return []
        counselor_ids = counselors.strip().split(',')
        user_profile = self.get_user_profile(user_id)
        if not user_profile:
            # No profile available: return every candidate with a neutral score.
            ret = []
            for counselor_id in counselor_ids:
                ret.append({
                    'counselor': str(counselor_id),
                    'score': 0.0,
                })
            return ret
        predict_data = self.trans_feature_data(user_id, user_profile, counselor_ids)
        doctor_ids = predict_data.pop('doctor_id').to_numpy()
        pre_time = time.time()
        predict_result = self.model.predict_proba(predict_data)[:, 1]
        self.logger.info('predict_time: {} ms'.format(int((time.time() - pre_time) * 1000)))
        result_dict = dict(zip(doctor_ids, predict_result))
        sorted_result = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
        recommend_data = [{
            'counselor': str(c_id),
            'score': round(float(proba), 4),
        } for (c_id, proba) in sorted_result]
        return recommend_data

    @cost_time(desc='build feature data')
    def trans_feature_data(self, user_id, user_profile, counselors):
        user_feature_data = self.trans_user_feature_data(user_id, user_profile)
        counselor_feature_data = self.trans_counselor_feature_data(counselors)
        counselor_num = len(counselor_feature_data)
        # Repeat the single user feature row once per candidate counselor, then
        # join the user columns and the counselor columns side by side.
        user_feature_data_dataframe = pd.DataFrame(
            [user_feature_data] * counselor_num,
            columns=['ffrom_login_encoder', 'user_login_city_encoder', 'cate_id_1_encoder',
                     'cate_id_2_encoder', 'cate_id_3_encoder', 'cate_id_4_encoder', 'cate_id_5_encoder'])
        predict_feature_data = pd.concat([user_feature_data_dataframe, counselor_feature_data], axis=1)
        return predict_feature_data

    def trans_user_feature_data(self, user_id, user_profile):
        if not user_profile:
            user_profile = self.get_user_profile(user_id)
        from_login_encoder = self.get_encoder_from_dict('ffrom_login', user_profile['ffrom_login'])
        user_login_city_encoder = self.get_encoder_from_dict('user_login_city', user_profile['user_login_city'])
        user_preference_cate = user_profile['user_preference_cate']
        user_preference_cate_top_5_encoder = self.process_user_preference_cate(user_preference_cate)
        user_feature_data = [from_login_encoder, user_login_city_encoder]
        user_feature_data.extend(user_preference_cate_top_5_encoder)
        return user_feature_data

    def trans_counselor_feature_data(self, counselor_ids):
        counselor_profiles = self.all_counselors[self.all_counselors['doctor_id'].isin(counselor_ids)].reset_index(drop=True)
        return counselor_profiles
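
    # Note: self.all_counselors (loaded by read_counselors(), not defined in this
    # commit) is assumed to be a DataFrame with one row per counselor, holding a
    # 'doctor_id' column plus the already-encoded counselor feature columns the
    # model was trained on; 'doctor_id' is popped off again before prediction in rank().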

    @cost_time(desc='fetch user profile')
    def get_user_profile(self, user_id):
        if user_id == '0':
            return []
        headers = {
            'X-App-Id': 'plough_cloud',
            'Content-Type': 'application/json'
        }
        payload = {
            'filter': {
                'uid': user_id,
            },
            'fields': self.select_fields,
            'limit': 10
        }
        try:
            response = requests.request('POST', self.dmp_url, headers=headers, json=payload)
            resp = response.json()
            return resp['data']['objects'][0]
        except Exception as e:
            self.logger.error('failed to fetch user profile data: %s', e, exc_info=True)
            try:
                self.logger.exception('response json data %s', resp)
            except Exception:
                pass
            return []
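
    # For illustration only (the exact DMP response contract is not part of this
    # commit), the JSON consumed above is assumed to be shaped roughly like
    #     {"data": {"objects": [{"uid": "...", "ffrom_login": "...",
    #                            "user_login_city": "...", "user_preference_cate": "[...]"}]}}
    # and only the first matched object is used.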

    def process_user_preference_cate(self, preference_cate):
        # Keep at most the top five preferred category ids, pad with 0, then map
        # each slot through its own label-encoder dict (0 = unseen value).
        result = [0, 0, 0, 0, 0]
        ids = []
        if isinstance(preference_cate, str):
            pref_data = json.loads(preference_cate)
            for info in pref_data:
                ids.append(info['cate_id'])
        ids = ids[:5]
        for ind, val in enumerate(ids):
            result[ind] = val
        encoder_result = []
        for ind, val in enumerate(result):
            value_convert_dict = self.user_encoder_convert.get('cate_id_{}_encoder'.format(ind + 1))
            if value_convert_dict is not None:
                encoder_result.append(value_convert_dict.get(val, 0))
        if len(encoder_result) < 5:
            encoder_result.extend([0] * (5 - len(encoder_result)))
        return encoder_result

    def get_encoder_from_dict(self, feature_name, feature_value):
        value_convert_dict = self.user_encoder_convert.get('{}_encoder'.format(feature_name))
        if value_convert_dict is None:
            return 0
        return value_convert_dict.get(str(feature_value), 0)
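
# For reference, the mapping returned by read_user_encoder_dict() (consumed by the
# two encoder helpers above) is assumed to hold one value -> integer-code dict per
# feature, with 0 reserved for unseen values. Illustrative keys and codes only:
#     {
#         'ffrom_login_encoder': {'ios': 1, 'android': 2, ...},
#         'user_login_city_encoder': {'北京': 1, '上海': 2, ...},
#         'cate_id_1_encoder': {...},
#         ...
#         'cate_id_5_encoder': {...},
#     }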


if __name__ == '__main__':
    ranker = RankByXGB()
    print(ranker.rank('12047', '37298,21144,12019,13010,11038,2830'))
    print(ranker.rank('0', '37298,21144,12019,13010,11038,2830'))
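    # Illustrative output shape only (actual scores depend on the trained model):
    #     [{'counselor': '37298', 'score': 0.8312}, {'counselor': '21144', 'score': 0.6054}, ...]
    # The second call passes user_id '0', which has no profile, so every candidate
    # comes back with score 0.0 in the original order.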