Project: ydl_ai_recommender
Commit: 33d575d5 (parent 6fe86fc6)
Authored: May 18, 2023 by 王金柱
Commit message: 精排序 (fine-grained ranking)

Showing 1 changed file with 166 additions and 0 deletions.

src/core/rank/rankByXgb.py  +166 / -0  (new file, mode 0 → 100644)
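The new module implements the fine-ranking step shown below: RankByXGB loads a pre-trained XGBoost binary classifier from model_data/xgb_model.bin, fetches the user's profile fields (ffrom_login, user_login_city, user_preference_cate) from the DMP service configured under [DMP] url, encodes them together with each candidate counselor's profile, and returns the candidates sorted by predicted probability as {'counselor', 'score'} pairs with scores rounded to four decimals. When no profile is available (for example user_id '0'), every candidate falls back to a score of 0.0.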
# -*- coding:utf-8 -*-
import time
import requests
import os
import json
import configparser
import xgboost as xgb
import pandas as pd
import numpy as np

from ydl_ai_recommender.src.core.profile import encode_user_profile
from ydl_ai_recommender.src.utils import get_conf_path, get_data_path, read_user_encoder_dict, read_counselors, get_project_path
from ydl_ai_recommender.src.utils.log import create_logger

logger = create_logger(__name__, 'rankByXgb.log')


def cost_time(desc):
    def the_func_cost_time(func):
        def fun(*args, **kwargs):
            t = time.perf_counter()
            result = func(*args, **kwargs)
            logger.info('函数:{},{} 耗时:{} ms'.format(str(func.__name__), desc, round((time.perf_counter() - t) * 1000, 2)))
            return result
        return fun
    return the_func_cost_time


class RankByXGB:

    def __init__(self, is_use_db=True) -> None:
        self.logger = create_logger(__name__, 'rankByXgb.log')
        config = configparser.RawConfigParser()
        config.read(get_conf_path())
        self.dmp_url = config.get('DMP', 'url')

        select_items = ['uid', 'ffrom_login', 'user_login_city', 'user_preference_cate']
        self.select_fields = {k: True for k in select_items}
        self.user_encoder_convert = read_user_encoder_dict()
        self.all_counselors = read_counselors()

        self.params = {
            'n_estimators': 150,
            'max_depth': 7,
            'min_child_weight': 5,
            'gamma': 0,
            'subsample': 0.9,
            'colsample_bytree': 0.5,
            'reg_alpha': 0,
            'reg_lambda': 1,
            'learning_rate': 0.1,
            'max_delta_step': 0,
            'scale_pos_weight': 1
        }
        self.model = xgb.XGBClassifier(objective='binary:logistic', nthread=-1, **self.params)
        self.model.load_model(os.path.join(get_project_path(), 'model_data/xgb_model.bin'))

    @cost_time(desc='模型推荐整个流程')
    def rank(self, user_id, counselors):
        if not counselors or not counselors.strip():
            return []
        counselors = counselors.strip()
        counselors = counselors.split(',')

        user_profile = self.get_user_profile(user_id)
        if not user_profile:
            ret = []
            for counselor_id in counselors:
                ret.append({
                    'counselor': str(counselor_id),
                    'score': 0.0,
                })
            return ret

        predit_data = self.trans_feature_data(user_id, user_profile, counselors)
        doctor_ids = predit_data.pop('doctor_id')
        doctor_ids = doctor_ids.to_numpy()

        pre_time = time.time()
        predit_result = self.model.predict_proba(predit_data)[:, 1]
        self.logger.info('predit_time:{}ms'.format(int((time.time() - pre_time) * 1000)))

        result_dict = dict(zip(doctor_ids, predit_result))
        result_dict = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
        recommend_data = [{
            'counselor': str(c_id),
            'score': round(float(proba), 4),
        } for (c_id, proba) in result_dict]
        return recommend_data

    @cost_time(desc='')
    def trans_feature_data(self, user_id, user_profile, counselors):
        user_feature_data = self.trans_user_feature_data(user_id, user_profile)
        counselor_feature_data = self.trans_counselor_feature_data(counselors)
        counselor_num = len(counselor_feature_data)
        user_feature_data_dataframe = pd.DataFrame(
            [user_feature_data] * counselor_num,
            columns=['ffrom_login_encoder', 'user_login_city_encoder', 'cate_id_1_encoder',
                     'cate_id_2_encoder', 'cate_id_3_encoder', 'cate_id_4_encoder', 'cate_id_5_encoder'])
        predit_feature_data = pd.concat([user_feature_data_dataframe, counselor_feature_data], axis=1)
        return predit_feature_data

    def trans_user_feature_data(self, user_id, user_profile):
        if not user_profile:
            user_profile = self.get_user_profile(user_id)
        from_login_encoder = self.get_encoder_from_dict('ffrom_login', user_profile['ffrom_login'])
        user_login_city_encoder = self.get_encoder_from_dict('user_login_city', user_profile['user_login_city'])
        user_preference_cate = user_profile['user_preference_cate']
        user_preference_cate_top_5_encoder = self.process_user_preference_cate(user_preference_cate)
        user_feature_data = [from_login_encoder, user_login_city_encoder]
        user_feature_data.extend(user_preference_cate_top_5_encoder)
        return user_feature_data

    def trans_counselor_feature_data(self, counselor_ids):
        counselor_profiles = self.all_counselors[self.all_counselors['doctor_id'].isin(counselor_ids)].reset_index(drop=True)
        return counselor_profiles

    @cost_time(desc='获取用户画像')
    def get_user_profile(self, user_id):
        if user_id == '0':
            return []
        headers = {
            'X-App-Id': 'plough_cloud',
            'Content-Type': 'application/json'
        }
        payload = {
            "filter": {
                "uid": user_id,
            },
            "fields": self.select_fields,
            "limit": 10
        }
        try:
            get_profile_time = time.time()
            response = requests.request('POST', self.dmp_url, headers=headers, json=payload)
            resp = response.json()
            return resp['data']['objects'][0]
        except Exception as e:
            self.logger.error('获取用户画像数据失败: %s', e, exc_info=True)
            try:
                self.logger.exception('response json data %s', resp)
            except:
                pass
            return []

    def process_user_preference_cate(self, preference_cate):
        result = [0, 0, 0, 0, 0]
        ids = []
        if isinstance(preference_cate, str):
            pref_data = json.loads(preference_cate)
            for info in pref_data:
                ids.append(info['cate_id'])
        ids = ids[0:min(5, len(ids))]
        for ind, val in enumerate(ids):
            result[ind] = val

        encoder_result = []
        for ind, val in enumerate(result):
            value_convert_dict = self.user_encoder_convert.get('cate_id_{}_encoder'.format(ind + 1))
            if value_convert_dict is not None:
                encoder_result.append(value_convert_dict.get(val, 0))
        if len(encoder_result) < 5:
            encoder_result.extend([0] * (5 - len(encoder_result)))
        return encoder_result

    def get_encoder_from_dict(self, feature_name, feature_value):
        value_convert_dict = self.user_encoder_convert.get('{}_encoder'.format(feature_name))
        if value_convert_dict is None:
            return 0
        return value_convert_dict.get(str(feature_value), 0)


if __name__ == '__main__':
    ranker = RankByXGB()
    print(ranker.rank('12047', '37298,21144,12019,13010,11038,2830'))
    print(ranker.rank('0', '37298,21144,12019,13010,11038,2830'))

\ No newline at end of file
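A minimal sketch, not part of the commit: it illustrates the DMP document shape that get_user_profile() appears to assume, inferred from the request payload ("filter"/"fields"/"limit") and from how the response is unpacked (resp['data']['objects'][0]) and then consumed by trans_user_feature_data() and process_user_preference_cate(). All concrete values are invented placeholders.

# Illustrative only -- field values are made-up placeholders, not real data.
example_dmp_response = {
    "data": {
        "objects": [
            {
                "uid": "12047",               # the queried user id
                "ffrom_login": "ios",         # raw login-channel value, mapped via the ffrom_login_encoder dict
                "user_login_city": "010",     # raw login-city value, mapped via the user_login_city_encoder dict
                # JSON-encoded list; the first five cate_id values feed cate_id_1..5_encoder
                "user_preference_cate": "[{\"cate_id\": 3}, {\"cate_id\": 7}]",
            }
        ]
    }
}

# rank() returns candidates sorted by predicted probability; shape only, values illustrative:
# [{'counselor': '21144', 'score': 0.7321}, {'counselor': '37298', 'score': 0.4105}, ...]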