增加精排模型

b3206578 · 柴鹏飞 · 31041a93 · b3206578 · b3206578 · b3206578
Commit b3206578 authored Mar 07, 2023 by 柴鹏飞
Showing with 379 additions and 0 deletions

preprocess.py src/core/rank/preprocess.py +77 -0

ranker.py src/core/rank/ranker.py +121 -0

test.py src/core/rank/test.py +112 -0

train.py src/core/rank/train.py +69 -0

No files found.
--- a/src/core/rank/preprocess.py
+++ b/src/core/rank/preprocess.py
+# -*- coding: utf-8 -*-
+
+import json
+import random
+
+from ydl_ai_recommender.src.core.manager import (
+    CounselorProfileManager,
+    UserEventManager,
+)
+from ydl_ai_recommender.src.core.profile import encode_user_profile
+from ydl_ai_recommender.src.core.counselor_profile import encode_counselor_profile
+
+
+def make_data(start_date, end_date, type):
+    
+    # 平台用户数量太多，仅处理与本次任务相关的用户
+    manager = UserEventManager()
+    user_profiles = manager.load_xlsx_data(f'expose_click_user_{start_date}_{end_date}.xlsx')
+    user_embeddings = {}
+    for _, profile in user_profiles.iterrows():
+        user_embeddings[str(profile['uid'])] = encode_user_profile(profile)
+
+    counselor_profile_manager = CounselorProfileManager()
+    counselor_profiles = counselor_profile_manager.load_xlsx_data('all_counselor_profile.xlsx')
+
+    counselor_embeddings = {}
+    for _, profile in counselor_profiles.iterrows():
+        counselor_embeddings[str(profile['doctor_id'])] = encode_counselor_profile(profile)
+
+    # 曝光点击数据
+    train_data = []
+    expost_click = manager.load_data(start_date=start_date, end_date=end_date)
+    for _, profile in expost_click.iterrows():
+        uid = str(profile['user_ids'])
+        if ',' in uid:
+            continue
+
+        if uid not in user_embeddings:
+            continue
+        
+        expose_ids = set(str(profile['ae_doctor_ids']).split(','))
+        click_ids = set(str(profile['doctor_card_doctor_ids']).split(',') + str(profile['private_chat_doctor_ids']).split(','))
+        click_ids2 = [_id for _id in click_ids if _id in counselor_embeddings]
+
+        for _id in click_ids2:
+            train_data.append([user_embeddings[uid] + counselor_embeddings[_id], 1])
+        
+        uninterested_ids = list(expose_ids - click_ids)
+        for _id in random.choices(uninterested_ids, k=min(len(click_ids2), len(uninterested_ids))):
+            if _id not in counselor_embeddings:
+                continue
+            train_data.append([user_embeddings[uid] + counselor_embeddings[_id], 0])
+
+    print('data count: ', len(train_data))
+    print('label=0\t{}'.format(len(list(filter(lambda x: x[1] == 0, train_data)))))
+    print('label=1\t{}'.format(len(list(filter(lambda x: x[1] == 1, train_data)))))
+
+    with open(f'data/all_{type}.json', 'w') as f:
+        json.dump(train_data, f)
+    
+
+def make_train_data():
+    start_date = '2023-01-01'
+    end_date = '2023-02-01'
+    make_data(start_date, end_date, 'train')
+
+
+def make_test_data():
+    start_date = '2023-02-01'
+    end_date = '2023-02-10'
+    make_data(start_date, end_date, 'test')
+
+
+if __name__ == '__main__':
+    make_train_data()
+    make_test_data()
\ No newline at end of file
--- a/src/core/rank/ranker.py
+++ b/src/core/rank/ranker.py
+# -*- coding: utf-8 -*-
+
+import os
+import pickle
+from typing import List
+
+from ydl_ai_recommender.src.utils.log import create_logger
+from ydl_ai_recommender.src.utils import get_conf_path, get_model_path
+from ydl_ai_recommender.src.data.mysql_client import MySQLClientPool
+from ydl_ai_recommender.src.core.manager import CounselorProfileManager
+from ydl_ai_recommender.src.core.profile import encode_user_profile
+from ydl_ai_recommender.src.core.counselor_profile import encode_counselor_profile
+
+
+class Ranker():
+    def __init__(self, is_use_db=True) -> None:
+        self.logger = create_logger(__name__, 'ranker.log')
+        if is_use_db:
+            self.client = MySQLClientPool.create_from_config_file(get_conf_path())
+        else:
+            self.client = None
+            self.logger.warn('未连接数据库')
+
+        self.select_items_str = ', '.join([
+            'uid', 'country_code', 'channel_id_type', 'ffrom_login', 'user_preference_cate', 'consult_pay_money',
+            'listen_pay_money', 'test_items_pay_money', 'course_pay_money', 'consult_order_num',
+            'listen_order_num', 'test_items_order_num', 'course_order_num', 'aidi_cst_bias_city',
+            'aidi_cst_bias_sex', 'aidi_cst_bias_price', 'aidi_cst_bias_server_type', 'user_login_city',
+            'd30_inquire_order_num', 'd30_session_num',
+        ])
+
+        # 用户画像embedding维度，后期需优化
+        self.user_embedding_dimension = 98
+        self.counselor_embeddings_dimension = 171
+
+        self.counselor_embeddings = self._load_counselor_embeddings()
+        model_path = os.path.join(get_model_path(), 'ranker', 'LR_v1.1.pkl')
+        with open(model_path, 'rb') as f:
+            self.model = pickle.load(f)
+
+    def _load_counselor_embeddings(self):
+        counselor_profile_manager = CounselorProfileManager()
+        counselor_profiles = counselor_profile_manager.load_xlsx_data('all_counselor_profile.xlsx')
+        counselor_embeddings = {}
+        for _, profile in counselor_profiles.iterrows():
+            counselor_embeddings[str(profile['doctor_id'])] = encode_counselor_profile(profile)
+        return counselor_embeddings
+        
+    def get_user_profile(self, user_id):
+        if user_id == '0':
+            return None
+            
+        sql = 'SELECT {} FROM ads.ads_register_user_profiles'.format(self.select_items_str)
+        sql += ' WHERE uid={}'.format(user_id)
+        try:
+            _, all_data = self.client.query(sql)
+            if len(all_data) == 0:
+                return []
+            return all_data[0]
+        except Exception as e:
+            self.logger.error('获取用户画像数据失败', exc_info=True)
+            
+        return None
+
+    def default_user_embedding(self):
+        return [.0] * self.user_embedding_dimension
+
+    def get_user_embedding(self, user_id):
+        user_profile = self.get_user_profile(user_id)
+        if not user_profile:
+            return self.default_user_embedding()
+        
+        user_embedding = encode_user_profile(user_profile)
+        return user_embedding
+
+    def default_counselor_embedding(self):
+        return [.0] * self.counselor_embeddings_dimension
+    
+    def get_counselor_embeddings(self, counselors: List):
+        return [
+            self.counselor_embeddings.get(counselor, self.default_counselor_embedding())
+            for counselor in counselors
+        ]
+
+    def compute_score(self, user_embedding, counselor_embeddings):
+        model_input = [
+            user_embedding + counselor_embedding
+            for counselor_embedding in counselor_embeddings
+        ]
+        result = self.model.predict_proba(model_input)
+        return map(lambda x: x[1] * 100, result)
+
+    def _rank(self, counselors, user_embedding, counselor_embeddings):
+        """ 为了方便测试，精排流程拆分为两个函数 """
+        scores = self.compute_score(user_embedding, counselor_embeddings)
+
+        ret = []
+        for score, counselor in sorted(zip(scores, counselors), reverse=True):
+            ret.append({
+                'counselor': counselor,
+                'score': score,
+            })
+        return ret
+
+    def rank(self, uid, counselors):
+        counselors = counselors.strip()
+        if not counselors:
+            return []
+        
+        counselors = counselors.split(',')
+        user_embedding = self.get_user_embedding(uid)
+        counselor_embeddings = self.get_counselor_embeddings(counselors)
+        return self._rank(counselors, user_embedding, counselor_embeddings)
+
+
+if __name__ == '__main__':
+    ranker = Ranker()
+    print(ranker.rank('13577448', '17607,4792,4202,13588,11600,13242'))
+
+    print(ranker.rank('0', '17607,4792,4202,13588,11600,0'))
\ No newline at end of file
--- a/src/core/rank/test.py
+++ b/src/core/rank/test.py
+# -*- coding: utf-8 -*-
+
+import json
+import random
+from collections import Counter
+
+from ydl_ai_recommender.src.core.manager import (
+    CounselorProfileManager,
+    UserEventManager,
+)
+from ydl_ai_recommender.src.core.profile import encode_user_profile
+from ydl_ai_recommender.src.core.counselor_profile import encode_counselor_profile
+from ydl_ai_recommender.src.core.rank.ranker import Ranker
+
+
+ranker = Ranker(is_use_db=False)
+
+
+def make_data(start_date, end_date):
+    
+    # 平台用户数量太多，仅处理与本次任务相关的用户
+    manager = UserEventManager()
+    user_profiles = manager.load_xlsx_data(f'expose_click_user_{start_date}_{end_date}.xlsx')
+    user_embeddings = {}
+    for _, profile in user_profiles.iterrows():
+        user_embeddings[str(profile['uid'])] = encode_user_profile(profile)
+
+    counselor_profile_manager = CounselorProfileManager()
+    counselor_profiles = counselor_profile_manager.load_xlsx_data('all_counselor_profile.xlsx')
+
+    counselor_embeddings = {}
+    for _, profile in counselor_profiles.iterrows():
+        counselor_embeddings[str(profile['doctor_id'])] = encode_counselor_profile(profile)
+
+    # 曝光点击数据
+    test_data = []
+    expost_click = manager.load_data(start_date=start_date, end_date=end_date)
+    for _, profile in expost_click.iterrows():
+        uid = str(profile['user_ids'])
+        if ',' in uid:
+            continue
+
+        if uid not in user_embeddings:
+            u_emb = ranker.default_user_embedding()
+        else:
+            u_emb = user_embeddings[uid]
+
+        expose_ids = set(str(profile['ae_doctor_ids']).split(','))
+        click_ids = set(str(profile['doctor_card_doctor_ids']).split(',') + str(profile['private_chat_doctor_ids']).split(','))
+        uninterested_ids = list(expose_ids - click_ids)
+
+        for _id in click_ids:
+            click_id = _id
+            if _id in user_embeddings:
+                click_emb = counselor_embeddings[_id]
+            else:
+                click_emb = ranker.default_counselor_embedding()
+
+            c_embs = [click_emb]
+            c_ids = [click_id]
+
+            for _id in random.choices(uninterested_ids, k=min(19, len(uninterested_ids))):
+                if _id not in counselor_embeddings:
+                    continue
+                c_embs.append(counselor_embeddings[_id])
+                c_ids.append(_id)
+
+            if len(c_embs) < 2:
+                continue
+
+            test_data.append([
+                c_ids,
+                u_emb,
+                c_embs,
+            ])
+    print('test data row: ', len(test_data))
+    return test_data
+    
+
+def test():
+    start_date = '2023-02-01'
+    end_date = '2023-02-10'
+    test_data = make_data(start_date, end_date)
+
+    result_detail = []
+    for [c_ids, u_emb, c_embs] in test_data:
+        rank_result = ranker._rank(c_ids, u_emb, c_embs)
+
+        for i, result in enumerate(rank_result):
+            if result['counselor'] == c_ids[0]:
+                result_detail.append(i)
+                break
+        else:
+            result_detail.append(-1)
+
+    cnter = Counter(result_detail)
+    acc_detail = {}
+    for key, val in cnter.most_common():
+        acc_detail[key] = val
+        # print(key, val, val/len(result_detail))
+
+    print('==' * 20)
+    cnt = 0
+    for n in range(10):
+        if n in (1, 3, 5, 10):
+            print('top{:2}\t{:5}\t{:.2%}'.format(n, cnt, cnt / len(result_detail)))
+        cnt += acc_detail[n]
+
+
+if __name__ == '__main__':
+    test()
\ No newline at end of file
--- a/src/core/rank/train.py
+++ b/src/core/rank/train.py
+# -*- coding: utf-8 -*-
+
+import os
+import json
+import pickle
+import argparse
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+
+models = {
+    'LR': LogisticRegression(),
+    'LR_SS': make_pipeline(StandardScaler(), LogisticRegression()),
+    'MLP': MLPClassifier(),
+    'KNN': KNeighborsClassifier(),
+    'SVM': SVC(),
+    'DT': DecisionTreeClassifier(),
+    'RF': RandomForestClassifier(),
+    'Ada': AdaBoostClassifier(),
+}
+
+
+def main(args):
+    with open(os.path.join(args.data_path, 'all_train.json'), 'r') as f:
+        train_data = json.load(f)
+
+    x_train, y_train = [], []
+    for x, y in train_data:
+        x_train.append(x)
+        y_train.append(y)
+
+    model = models[args.model]
+    print('开始训练模型 ', args.model)
+    model.fit(x_train, y_train)
+    print('模型训练完成')
+
+    model_save_path = os.path.join(args.save_path, args.model + '.pkl')
+    with open(model_save_path, 'wb') as f:
+        pickle.dump(model, f)
+        print('模型已保存至 ', model_save_path)
+
+    if args.do_test:
+        with open('data/all_test.json', 'r') as f:
+            test_data = json.load(f)
+
+        x_test, y_test = [], []
+        for x, y in test_data:
+            x_test.append(x)
+            y_test.append(y)
+
+        print('{}\t{}'.format(args.model, model.score(x_test, y_test)))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='壹点灵 咨询师推荐 精排模型训练')
+    parser.add_argument('-m', '--model', type=str, default='LR', choices=list(models.keys()), help='模型类型')
+    parser.add_argument('--data_path', type=str, default='./data', help='训练数据存放目录')
+    parser.add_argument('--save_path', type=str, default='./model', help='训练好的模型存放目录')
+    parser.add_argument('--do_test', default=False, action='store_true', help='训练完成后执行测试程序')
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file