Commit 20d378e5 by 柴鹏飞

增加咨询师画像

parent b0845767
......@@ -7,7 +7,9 @@ from datetime import datetime
from ydl_ai_recommender.src.core.manager import (
OrderDataManager,
ChatDataManager,
ProfileManager,
UserProfileManager,
CounselorProfileManager,
UserEventManager,
)
from ydl_ai_recommender.src.core.indexer import (
UserCounselorDefaultIndexer,
......@@ -26,7 +28,11 @@ logger = create_logger(__name__, 'update.log')
parser = argparse.ArgumentParser(description='壹点灵 咨询师推荐 算法召回 离线更新数据模型')
parser.add_argument(
'-t', '--task', type=str, required=True,
choices=('load_db_data', 'make_embedding', 'make_index'), help='执行任务名称'
choices=('load_db_data', 'make_embedding', 'make_index', 'load_user_event_data'), help='执行任务名称'
)
parser.add_argument(
'--only_update', default=False, action='store_true', help='从数据库导出数据到本地是否覆盖原数据,默认不覆盖'
)
parser.add_argument('--index_last_date', default=None, type=str, help='构建索引最后日期,超过该日期的数据不使用')
......@@ -61,15 +67,17 @@ if __name__ == '__main__':
logger.info('')
if args.task == 'load_db_data':
initialize_dir()
if args.only_update is False:
initialize_dir()
logger.info('开始从数据库中更新数据')
client = MySQLClient.create_from_config_file(get_conf_path())
managers = [
['画像数据', ProfileManager(client)],
['画像数据', UserProfileManager(client)],
['订单数据', OrderDataManager(client)],
['询单数据', ChatDataManager(client)],
['咨询师画像', CounselorProfileManager(client)],
]
for [name, manager] in managers:
......@@ -84,9 +92,15 @@ if __name__ == '__main__':
logger.info('')
logger.info('--' * 50)
logger.info('开始构建用户特征 embedding')
manager = ProfileManager()
manager = UserProfileManager()
manager.make_embeddings()
logger.info('用户特征 embedding 构建完成')
logger.info('--' * 50)
logger.info('开始构建咨询师特征 embedding')
manager = CounselorProfileManager()
manager.make_embeddings()
logger.info('咨询师特征 embedding 构建完成')
if args.task == 'make_index':
......@@ -108,10 +122,13 @@ if __name__ == '__main__':
logger.info('所有索引更新数据完成')
# if args.task == 'make_virtual_embedding':
# logger.info('')
# logger.info('开始构建用户特征虚拟embedding')
# manager = ProfileManager()
# manager.make_virtual_embedding()
# logger.info('用户特征虚拟 embedding 构建完成')
\ No newline at end of file
if args.task == 'load_user_event_data':
    # Task: export user event-tracking data from the database through
    # UserEventManager, once per hard-coded date window below.
    logger.info('')
    logger.info('开始下载用户埋点数据')
    client = MySQLClient.create_from_config_file(get_conf_path())
    manager = UserEventManager(client)
    # Training data window
    manager.update_data(start_date='2023-01-01', end_date='2023-02-01')
    # Test data window
    manager.update_data(start_date='2023-02-01', end_date='2023-02-10')
    # The completion message names this task (event-data download) so that
    # update.log does not report an unrelated embedding build.
    logger.info('用户埋点数据下载完成')
\ No newline at end of file
# -*- coding: utf-8 -*-
import json
from typing import Dict, List, Any, Union
import pandas as pd
from .profile import BaseProfile
from .profile import CityProfile, NumClassProfile
class GenderProfile(BaseProfile):
    """Two-slot one-hot encoder for the counselor ``gender`` field.

    Vector layout: ``[1, 0]`` for code 1, ``[0, 1]`` for code 2 and
    ``[0, 0]`` for anything else (missing or unrecognised values).
    """

    def __init__(self) -> None:
        super().__init__()
        self.dim = 2

    def convert(self, value):
        """Encode a raw gender code as a 2-element one-hot list.

        Values that cannot be coerced with ``int()`` and codes other than
        1 or 2 are encoded as ``[0, 0]``.
        """
        try:
            value = int(value)
        except Exception:
            return [0, 0]
        if value == 1:
            return [1, 0]
        if value == 2:
            return [0, 1]
        return [0, 0]

    def inconvert(self, embedding):
        """Decode a 2-element gender vector back to a label.

        Returns ``'male'`` when slot 0 is set, ``'feminine'`` when slot 1 is
        set and ``'unknown_sex'`` otherwise.
        """
        if embedding[0] == 1:
            return 'male'
        # convert() marks code 2 by writing 1 into slot 1, so the decoder has
        # to test for 1 here; testing for 2 could never match.
        if embedding[1] == 1:
            return 'feminine'
        return 'unknown_sex'
class NumScaleProfile(BaseProfile):
    """Encode a numeric field as a single value scaled into ``[0, 1]``.

    ``convert`` computes ``clamp(value - min_val, 0, scale) / scale``;
    ``inconvert`` applies the inverse mapping.
    """

    def __init__(self, min_val=0, scale=1) -> None:
        """
        min_val: value that maps to 0 (smaller inputs are clamped to it).
        scale: width of the accepted range; inputs above ``min_val + scale``
            are clamped to 1.
        """
        super().__init__()
        self.dim = 1
        self.min_val = min_val
        self.scale = scale

    def convert(self, value):
        """Return ``[scaled]`` for *value*, or ``[0]`` if it is not int-coercible."""
        try:
            value = int(value)
        except Exception:
            return [0]
        value -= self.min_val
        value = min(value, self.scale)
        value = max(value, 0)
        return [value / self.scale]

    def inconvert(self, embedding: List[float]) -> str:
        """Decode a 1-element vector back to the (integer) original value.

        A stored 0 is reported as ``'unknown'``. Note that inputs less than
        or equal to ``min_val`` also encode to 0, so they decode as
        ``'unknown'`` too.
        """
        if embedding[0] == 0:
            return 'unknown'
        # Invert convert(): scale back up and restore the min_val offset.
        # round() is used instead of truncation because the float round trip
        # can land just below the integer (e.g. 0.29 * 100 == 28.999...).
        return str(int(round(embedding[0] * self.scale + self.min_val)))
class CertificationProfile(BaseProfile):
    """One-hot encoder for the counselor ``certification`` field.

    Only the four certificate names listed in ``certification_names`` get a
    slot; every other value is encoded as all zeros.
    """

    def __init__(self) -> None:
        super().__init__()
        self.dim = 4
        # These four certificate types account for roughly 95% of the data.
        self.certification_names = ['二级心理咨询师', '三级心理咨询师', '中级心理咨询师', '社会心理师']

    def convert(self, value):
        """Return a 4-element one-hot list; all zeros for an unlisted name."""
        one_hot = [0] * 4
        try:
            one_hot[self.certification_names.index(value)] = 1
        except ValueError:
            # Not one of the tracked certificates: keep the zero vector.
            pass
        return one_hot

    def inconvert(self, embedding):
        """Return the name for the first slot equal to 1, else ``'其他证书'``."""
        return next(
            (
                self.certification_names[position]
                for position, flag in enumerate(embedding)
                if flag == 1
            ),
            '其他证书',
        )
class EducationProfile(BaseProfile):
    """One-hot encoder for the counselor ``education`` code.

    Integer codes 3 through 8 map to slots 0 through 5; any other value is
    encoded as all zeros.
    """

    def __init__(self) -> None:
        super().__init__()
        self.dim = 6

    def convert(self, value):
        """Return a 6-element one-hot list for an education code."""
        one_hot = [0] * 6
        try:
            level = int(value)
        except Exception:
            return one_hot
        slot = level - 3
        if 0 <= slot <= 5:
            one_hot[slot] = 1
        return one_hot

    def inconvert(self, embedding):
        """Return the code (as a string) of the first set slot, else ``'unknown_education'``."""
        return next(
            (str(slot + 3) for slot, flag in enumerate(embedding) if flag == 1),
            'unknown_education',
        )
class OrderCateProfile(BaseProfile):
    """Multi-hot encoder for a counselor's order category summary.

    Each entry of ``cate_list`` owns one slot of the vector; a slot is set to
    1.0 when an item with that ``cate_name`` appears in the raw value.

    NOTE(review): this class was originally documented as handling
    ``order_tag_sum``, while ``profile_converters`` feeds it the
    ``order_cate_sum`` column -- confirm which field is intended.
    """

    def __init__(self) -> None:
        super().__init__()
        self.dim = 10
        self.cate_list = [
            '恋爱情感', '情绪压力', '婚姻家庭', '个人成长', '亲子教育',
            '人际关系', '职场发展', '心理健康', '人际社交', '两性心理',
        ]

    def convert(self, value):
        """Return a ``dim``-length multi-hot list for the raw field value.

        The raw value is wrapped in ``[...]`` before JSON parsing (presumably
        it is a comma-separated run of JSON objects -- confirm against the
        data). Unparseable input yields all zeros; items without a known
        ``cate_name`` are skipped.
        """
        ret = [0] * self.dim
        try:
            value = json.loads('[' + value + ']')
        except Exception:
            return ret
        for item in value:
            try:
                i = self.cate_list.index(item['cate_name'])
            except (KeyError, TypeError, ValueError):
                # KeyError/TypeError: item is not a dict with 'cate_name';
                # ValueError: category is not in cate_list. Skip the item, but
                # let KeyboardInterrupt/SystemExit propagate.
                continue
            ret[i] = 1.0
        return ret

    def inconvert(self, embedding):
        """Return the category names whose slots equal 1.0."""
        return [self.cate_list[idx] for idx, val in enumerate(embedding) if val == 1.0]
class TagGoodAtDirectionProfile(BaseProfile):
    """Multi-hot encoder for ``tag_good_at_direction`` (counselor specialty directions).

    Each entry of ``cate_list`` owns one slot of the vector; a slot is set to
    1.0 when an item with that ``cate_name`` appears in the parsed JSON.
    """

    def __init__(self) -> None:
        super().__init__()
        self.dim = 10
        self.cate_list = [
            '情绪压力', '亲子教育', '恋爱情感', '婚姻家庭', '心理健康',
            '个人成长', '人际关系', '职场发展', '人际社交', '两性心理',
        ]

    def convert(self, value):
        """Return a ``dim``-length multi-hot list for a JSON-encoded tag list.

        Unparseable input, or JSON that is not a list, yields all zeros;
        items without a known ``cate_name`` are skipped.
        """
        ret = [0] * self.dim
        try:
            value = json.loads(value)
        except Exception:
            return ret
        if not isinstance(value, list):
            # e.g. "null" or a bare number: nothing to iterate, and looping
            # over it would raise TypeError.
            return ret
        for item in value:
            try:
                i = self.cate_list.index(item['cate_name'])
            except (KeyError, TypeError, ValueError):
                # Malformed item or unknown category: skip it, but let
                # KeyboardInterrupt/SystemExit propagate.
                continue
            ret[i] = 1.0
        return ret

    def inconvert(self, embedding):
        """Return the direction names whose slots equal 1.0."""
        return [self.cate_list[idx] for idx, val in enumerate(embedding) if val == 1.0]
class TagGoodAtSubclassProfile(BaseProfile):
    """Multi-hot encoder for the ``tag_good_at_*`` counselor tag fields.

    Used for:
        - groups the counselor is good with
        - counseling style
        - services the counselor is good at
        - language ability
        - counseling school / approach

    The vocabulary is supplied by the caller; each entry owns one slot.
    """

    def __init__(self, cate_list) -> None:
        """
        cate_list: ordered tag names; position determines the vector slot.
        """
        super().__init__()
        self.dim = len(cate_list)
        self.cate_list = cate_list

    def convert(self, value):
        """Return a ``dim``-length multi-hot list for a JSON-encoded tag list.

        Unparseable input, or JSON that is not a list, yields all zeros;
        items without a known ``tag_name`` are skipped.
        """
        ret = [0] * self.dim
        try:
            value = json.loads(value)
        except Exception:
            return ret
        if not isinstance(value, list):
            # e.g. "null" or a bare number: nothing to iterate, and looping
            # over it would raise TypeError.
            return ret
        for item in value:
            try:
                i = self.cate_list.index(item['tag_name'])
            except (KeyError, TypeError, ValueError):
                # Malformed item or tag outside the vocabulary: skip it, but
                # let KeyboardInterrupt/SystemExit propagate.
                continue
            ret[i] = 1.0
        return ret

    def inconvert(self, embedding):
        """Return the tag names whose slots equal 1.0."""
        return [self.cate_list[idx] for idx, val in enumerate(embedding) if val == 1.0]
# Tag vocabularies handed to TagGoodAtSubclassProfile in profile_converters.
# The position of a name in its list is the slot it occupies in the encoded
# vector, so appending is safe but reordering/removing changes the layout of
# embeddings that were already built.

# Vocabulary for the 'tag_good_at_group' field.
good_at_group_tag_list = [
'青少年', '职场人', '家庭关系', '大学生', '伴侣', '夫妻', '中学生', '公务员', '留学生',
'精神康复者', '小学生', '孕产妇', '老年人', '成年人', '中年人',
]
# Vocabulary for the 'tag_good_at_style' field.
good_at_style_tag_list = [
'温暖', '温和而真诚', '耐心', '专业', '抱持', '赋能', '温和而坚定', '深刻而有力', '沉稳',
'真实', '深入', '富于创造性', '清晰', '简洁', '接纳包容', '真诚', '敏锐专业', '温和', '坚定',
]
# Vocabulary for the 'tag_good_at_service' field.
good_at_service_tag_list = [
'情绪疏导', '关系改善', '情绪管理', '情感分析', '情感陪伴', '心理分析', '动力提升', '创伤修复',
'恋爱指导', '行为矫正', '离婚指导', '认知调整', '心态调整', '经验指导', '身心减压', '性格改善',
'失眠改善', '绘画分析', '情感修复', '梦境解析', '习惯养成', '危机消除',
]
# Vocabulary for the 'tag_good_at_language' field.
good_at_language_tag_list = [
'普通话', '英语', '粤语', '四川话', '江西方言', '闽南语', '日语', '韩语', '德语', '俄语',
'西班牙语', '法语', '意大利语',
]
# Vocabulary for the 'tag_good_at_school' field.
good_at_school_tag_list = [
'认知行为疗法', '叙事疗法', '整合疗法', '精神分析', '焦点解决短期疗法', '人本主义治疗',
'婚姻与家庭治疗', '催眠疗法', '森田疗法', '格式塔疗法', '心理动力学疗法', '沙盘治疗', '其他疗法',
'表达性艺术疗法', '接纳承诺疗法', '心理危机干预', '辩证行为疗法', '游戏治疗', '情感关注疗法',
]
# Ordered registry of [profile column name, converter] pairs.
# encode_counselor_profile concatenates each converter's output in this order
# and decode_counselor_profile slices the vector by converter.dim in the same
# order, so the order here defines the embedding layout.
profile_converters = [
['gender', GenderProfile()],
['age', NumScaleProfile(0, 100)],  # min_val=0, scale=100
['certification', CertificationProfile()],
['education', EducationProfile()],
['city', CityProfile(level=4)],  # NOTE(review): level 4 presumably means district level -- confirm in CityProfile
['work_years', NumScaleProfile(0, 20)],  # min_val=0, scale=20
['product_min_price', NumScaleProfile(10, 1200-10)],  # min_val=10, scale=1190: prices are clamped to [10, 1200]
['order_cate_sum', OrderCateProfile()],
['tag_good_at_direction', TagGoodAtDirectionProfile()],
['tag_good_at_group', TagGoodAtSubclassProfile(good_at_group_tag_list)],
['tag_good_at_style', TagGoodAtSubclassProfile(good_at_style_tag_list)],
['tag_good_at_service', TagGoodAtSubclassProfile(good_at_service_tag_list)],
['tag_good_at_language', TagGoodAtSubclassProfile(good_at_language_tag_list)],
['tag_good_at_school', TagGoodAtSubclassProfile(good_at_school_tag_list)],
]
def encode_counselor_profile(profile):
    """
    Convert a counselor profile into a flat vector.

    Every [column, converter] pair in ``profile_converters`` is applied to
    ``profile[column]`` and the resulting pieces are concatenated in
    registry order.
    """
    return [
        component
        for column, converter in profile_converters
        for component in converter.convert(profile[column])
    ]
def decode_counselor_profile(embedding):
    """
    Convert a vector back into a counselor profile dict.

    Walks ``profile_converters`` in order, hands each converter the next
    ``converter.dim`` elements of ``embedding`` and stores the decoded value
    under the converter's column name.
    """
    decoded = {}
    offset = 0
    for column, converter in profile_converters:
        stop = offset + converter.dim
        decoded[column] = converter.inconvert(embedding[offset:stop])
        offset = stop
    return decoded
\ No newline at end of file
......@@ -3,6 +3,8 @@
from .manager import Manager
from .database_manager import DatabaseDataManager
from .profile_manager import ProfileManager
from .profile_manager import UserProfileManager
from .profile_manager import CounselorProfileManager
from .chat_data_manager import ChatDataManager
from .order_data_manager import OrderDataManager
\ No newline at end of file
from .order_data_manager import OrderDataManager
from .user_event_manager import UserEventManager
\ No newline at end of file
......@@ -6,18 +6,19 @@ from typing import List
import pandas as pd
from ydl_ai_recommender.src.core.profile import encode_profile
from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.core.counselor_profile import encode_counselor_profile
from ydl_ai_recommender.src.core.manager import DatabaseDataManager
from ydl_ai_recommender.src.utils.log import create_logger
class ProfileManager(DatabaseDataManager):
class UserProfileManager(DatabaseDataManager):
"""
订单用户画像数据管理
"""
def __init__(self, client=None) -> None:
super().__init__(client, create_logger(__name__, 'profile_manager.log'))
super().__init__(client, create_logger(__name__, 'user_profile_manager.log'))
self.select_items_str = ', '.join([
'uid', 'country_code', 'channel_id_type', 'ffrom_login', 'user_preference_cate', 'consult_pay_money',
'listen_pay_money', 'test_items_pay_money', 'course_pay_money', 'consult_order_num',
......@@ -67,7 +68,7 @@ class ProfileManager(DatabaseDataManager):
self.logger.info('开始构建订单用户的用户画像向量')
for _, profile in user_profiles.iterrows():
user_ids.append(str(profile['uid']))
embeddings.append(encode_profile(profile))
embeddings.append(encode_user_profile(profile))
self.logger.info('用户画像向量构建完成,共构建 %s 用户', len(user_ids))
......@@ -109,7 +110,62 @@ class ProfileManager(DatabaseDataManager):
json.dump(v_embedding_list, f, ensure_ascii=False)
class CounselorProfileManager(DatabaseDataManager):
    """
    Counselor profile data management: export the profile table to a local
    xlsx file and build profile embeddings from it.
    """

    def __init__(self, client=None) -> None:
        super().__init__(client, create_logger(__name__, 'counselor_profile_manager.log'))
        # Columns selected from the counselor stats table.
        columns = [
            'doctor_id',
            'uid', 'gender', 'age', 'certification', 'education', 'city', 'work_years', 'product_min_price',
            'order_tag_sum', 'order_cate_sum', 'tag_good_at_direction', 'tag_good_at_group', 'tag_good_at_style',
            'tag_good_at_service', 'tag_good_at_language', 'tag_good_at_school', 'cate_good_at',
        ]
        self.select_items_str = ', '.join(columns)

    def _make_query_sql(self):
        """Build the SELECT statement for all non-deleted counselor rows."""
        select_clause = 'SELECT {} FROM ads.ads_doctor_stats'.format(self.select_items_str)
        return select_clause + ' WHERE is_deleted = 0'

    def update_data(self):
        """Fetch the latest profile features from the database and save them."""
        _, rows = self.fetch_data_from_db(self._make_query_sql())
        self.save_xlsx_data(pd.DataFrame(rows), 'all_counselor_profile.xlsx')

    def _load_profile_data(self):
        """Load the locally saved counselor profile sheet."""
        return self.load_xlsx_data('all_counselor_profile.xlsx')

    def make_embeddings(self):
        """
        Encode every saved counselor profile and write the results to disk.

        Writes one id per line to ``counselor_embeddings_ids.txt`` and the
        vectors, in the same order, to ``counselor_embeddings.json`` under
        ``self.local_file_dir``. Returns the list of vectors.
        """
        profiles = self._load_profile_data()
        self.logger.info('咨询师画像数据加载完成,共加载 %s 条', len(profiles))

        self.logger.info('开始构建咨询师画像向量')
        counselor_ids = []
        vectors = []
        for _, row in profiles.iterrows():
            # NOTE(review): ids come from 'uid' although 'doctor_id' is also
            # selected -- confirm this is the key downstream consumers expect.
            counselor_ids.append(str(row['uid']))
            vectors.append(encode_counselor_profile(row))
        self.logger.info('咨询师画像向量构建完成,共构建 %s 咨询师', len(counselor_ids))

        ids_path = os.path.join(self.local_file_dir, 'counselor_embeddings_ids.txt')
        with open(ids_path, 'w', encoding='utf-8') as ids_file:
            ids_file.write('\n'.join(counselor_ids))

        vectors_path = os.path.join(self.local_file_dir, 'counselor_embeddings.json')
        with open(vectors_path, 'w', encoding='utf-8') as vectors_file:
            json.dump(vectors, vectors_file, ensure_ascii=False)

        return vectors
if __name__ == '__main__':
manager = ProfileManager()
# manager = UserProfileManager()
# manager.make_embeddings()
manager.make_virtual_embedding()
\ No newline at end of file
# manager.make_virtual_embedding()
manager = CounselorProfileManager()
manager.update_data()
manager.make_embeddings()
\ No newline at end of file
......@@ -271,7 +271,7 @@ class CityProfile(BaseProfile):
def __init__(self, level=2) -> None:
"""
level: 级别,2-省/直辖市 ; 3-区; 4-区;6-投递区
level: 级别,2-省/直辖市 ;4-区;6-投递区
"""
super().__init__()
......@@ -359,7 +359,7 @@ profile_converters = [
]
def encode_profile(profile):
def encode_user_profile(profile):
"""
将用户画像信息转换为向量
"""
......@@ -369,7 +369,7 @@ def encode_profile(profile):
return embedding
def decode_profile(embedding):
def decode_user_profile(embedding):
"""
向量转换为用户画像
"""
......
......@@ -14,7 +14,7 @@ from ydl_ai_recommender.src.core.indexer import (
UserCounselorCombinationIndexer,
CounselorCounselorCFIndexer,
)
from ydl_ai_recommender.src.core.profile import encode_profile
from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.data.mysql_client import MySQLClientPool
from ydl_ai_recommender.src.utils import get_conf_path, get_data_path
from ydl_ai_recommender.src.utils.log import create_logger
......@@ -151,7 +151,7 @@ class UserCFRecommender(Recommender):
def recommend_with_profile(self, user_profile, size=0, is_merge=True):
user_embedding = encode_profile(user_profile)
user_embedding = encode_user_profile(user_profile)
counselors = self._recommend(user_embedding)
# size == 0 时,不追加默认推荐咨询师
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment