python将pandas数据框,参数和函数传递给scipy.optimize.minimize
问题内容:
我正在尝试使用SciPy的scipy.optimize.minimize函数来最小化我自己编写的一个函数。但是,我要优化的这个函数本身是由其他基于pandas DataFrame执行计算的函数构成的。
我了解SciPy的minimize函数可以通过元组传入多个额外参数(例如,参见“scipy minimize函数的输入结构”这一问题)。但是,我不知道如何传递一个依赖于pandas DataFrame的函数。
我在下面创建了一个可复现的示例。
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.optimize import minimize
#################### Data ####################
# Raw observations, one row per (id_i, id_j) pair.
data = pd.DataFrame(
    {
        'id_i': ['AA', 'BB', 'CC', 'XX', 'DD'],
        'id_j': ['ZZ', 'YY', 'XX', 'BB', 'AA'],
        'y': [0.30, 0.60, 0.70, 0.45, 0.65],
        'num': [1000, 2000, 1500, 1200, 1700],
        'bar': [-4.0, -6.5, 1.0, -3.0, -5.5],
        'mu': [-4.261140, -5.929608, 1.546283, -1.810941, -3.186412],
    }
)
# Derived columns: 'bar' shifted by 11 times the standard-normal quantile
# at 1/1.9 (foo_1) and at 1 - 1/1.9 (foo_2).
data = data.assign(
    foo_1=data['bar'] - 11 * norm.ppf(1/1.9),
    foo_2=data['bar'] - 11 * norm.ppf(1 - (1/1.9)),
)
# Sorted list of every distinct id appearing in either id column.
id_list = sorted(set(data['id_i']).union(data['id_j']))
#################### Functions ####################
# Function 1: Intermediate calculation to calculate predicted values.
def calculate_y_pred(row, delta_params, sigma_param, id_list):
    """Return the predicted value of y for a single observation.

    Parameters
    ----------
    row : mapping-like
        Must support lookup of the keys 'id_i', 'id_j', 'mu', 'foo_1'
        and 'foo_2' (e.g. a pandas Series for one DataFrame row).
    delta_params : sequence
        One value per entry of ``id_list``; the values for the row's
        'id_i' and 'id_j' are looked up by position.
    sigma_param : float
        Passed to ``norm.cdf`` as ``scale``.
    id_list : sequence
        Ids, searched with ``.index`` to locate entries of
        ``delta_params``. If it is a list, an id that is absent raises
        ``ValueError``.

    Returns
    -------
    float
        ``cdf(foo_1) / (cdf(foo_1) + (1 - cdf(foo_2)))`` where ``cdf`` is
        the normal CDF with ``loc = mu - delta_i + delta_j`` and
        ``scale = sigma_param``.
    """
    # Extract the relevant values from delta_params.
    delta_i = delta_params[id_list.index(row['id_i'])]
    delta_j = delta_params[id_list.index(row['id_j'])]

    # Calculate adjusted version of mu.
    mu_adj = row['mu'] - delta_i + delta_j

    # Evaluate each CDF once; the foo_1 term appears in both the numerator
    # and the denominator.
    cdf_foo_1 = norm.cdf(row['foo_1'], loc=mu_adj, scale=sigma_param)
    cdf_foo_2 = norm.cdf(row['foo_2'], loc=mu_adj, scale=sigma_param)

    # Calculate predicted value of y.
    y_pred = cdf_foo_1 / (cdf_foo_1 + (1 - cdf_foo_2))
    return y_pred
# Function to calculate the log-likelihood (for a row of DataFrame data).
def loglik_row(row, delta_params, sigma_param, id_list):
    """Return the log-likelihood contribution of a single observation.

    The predicted value comes from ``calculate_y_pred``; the observed value
    and the count are read from ``row['y']`` and ``row['num']``.

    With ``s = sqrt(y_pred * (1 - y_pred) / n)`` the returned value is
    ``log(pdf(z) / s)`` where ``pdf`` is the standard normal density and
    ``z = (y_obs - y_pred) * sqrt(n) / sqrt(y_pred * (1 - y_pred))``.

    Parameters
    ----------
    row : mapping-like
        Must provide 'y' and 'num' plus whatever ``calculate_y_pred`` reads.
    delta_params, sigma_param, id_list
        Forwarded unchanged to ``calculate_y_pred``.
    """
    # Calculate the log-likelihood for this row.
    y_pred = calculate_y_pred(row, delta_params, sigma_param, id_list)
    y_obs = row['y']
    n = row['num']

    # Standardised residual fed to the standard normal density.
    z = ((y_obs - y_pred) * np.sqrt(n)) / np.sqrt(y_pred * (1 - y_pred))

    # The result is kept in a local that does not reuse the function's name.
    row_loglik = np.log(norm.pdf(z) / np.sqrt(y_pred * (1 - y_pred) / n))
    return row_loglik
# Function to calculate the sum of the negative log-likelihood.
# This function is called via SciPy's minimize function.
def loglik_total(data, id_list, params):
    """Return the negative log-likelihood summed over every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations; ``loglik_row`` is applied to each row (``axis=1``).
    id_list : sequence
        Ids; its length determines how many leading entries of ``params``
        are treated as delta parameters.
    params : sequence
        ``len(id_list)`` delta values followed by sigma as the last entry.

    Notes
    -----
    ``scipy.optimize.minimize`` invokes its objective as ``fun(x, *args)``,
    i.e. the parameter vector arrives in the FIRST positional slot. With
    this parameter order, ``minimize(fun=loglik_total, x0=...,
    args=(data, id_list))`` binds the parameter array to ``data``; the
    objective has to be wrapped or its parameters reordered for such a call.
    """
    # Extract parameters.
    delta_params = list(params[0:len(id_list)])
    # Read sigma from the vector being evaluated, not from the global
    # initial guess, so that the optimiser can actually vary it.
    sigma_param = params[-1]

    # Calculate the negative log-likelihood for every row in data and sum the values.
    neg_loglik = -np.sum(
        data.apply(
            lambda row: loglik_row(row, delta_params, sigma_param, id_list),
            axis=1,
        )
    )
    return neg_loglik
#################### Optimize ####################
# Provide initial parameter guesses: one delta of 0 per id, then sigma.
# (Built by repetition rather than a comprehension over a variable named
# `id`, which would shadow the builtin.)
delta_params = [0] * len(id_list)
sigma_param = 11
init_params = tuple(delta_params + [sigma_param])
# Maximize the log likelihood (minimize the negative log likelihood).
# NOTE: minimize evaluates the objective as fun(x, *args), so the objective
# must take the parameter vector as its first positional parameter, followed
# by `data` and `id_list` in the order given in `args`.
minimize(fun=loglik_total, x0=init_params,
         args=(data, id_list), method='nelder-mead')
这将导致以下错误:AttributeError: 'numpy.ndarray' object has no attribute 'apply'(完整的错误输出见下文)。我认为出现这个错误是因为minimize将其X视为numpy数组,而我想将其作为pandas DataFrame传递。
AttributeError: 'numpy.ndarray' object has no attribute 'apply'
AttributeErrorTraceback (most recent call last)
<ipython-input-93-9a5866bd626e> in <module>()
1 minimize(fun=loglik_total, x0=init_params,
----> 2 args=(data, id_list), method='nelder-mead')
/Users/adam/anaconda/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
436 callback=callback, **options)
437 elif meth == 'nelder-mead':
--> 438 return _minimize_neldermead(fun, x0, args, callback, **options)
439 elif meth == 'powell':
440 return _minimize_powell(fun, x0, args, callback, **options)
/Users/adam/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_neldermead(func, x0, args, callback, maxiter, maxfev, disp, return_all, initial_simplex, xatol, fatol, **unknown_options)
515
516 for k in range(N + 1):
--> 517 fsim[k] = func(sim[k])
518
519 ind = numpy.argsort(fsim)
/Users/adam/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in function_wrapper(*wrapper_args)
290 def function_wrapper(*wrapper_args):
291 ncalls[0] += 1
--> 292 return function(*(wrapper_args + args))
293
294 return ncalls, function_wrapper
<ipython-input-69-546e169fc54e> in loglik_total(data, id_list, params)
6
7 # Calculate the negative log-likelihood for every row in data and sum the values.
----> 8 loglik_total = -np.sum( data.apply(lambda row: loglik_row(row, delta_params, sigma_param, id_list), axis=1) )
9
10 return loglik_total
AttributeError: 'numpy.ndarray' object has no attribute 'apply'
在SciPy的minimize函数中处理DataFrame data并调用我的loglik_total函数的正确方法是什么?欢迎任何建议,不胜感激。
可能的解决方案:注意,我考虑过可以修改我的函数,把data视为numpy数组,而不是pandas DataFrame。但是,出于以下两个原因,我想避免这样做:1)在loglik_total中,我使用pandas的apply函数将loglik_row函数应用于data的每一行;2)通过列名而不是数字索引来引用data的列很方便。
问题答案:
数据格式不是问题,问题在于您调用loglik_total的方式不对。下面是修改后的版本,其参数顺序是正确的(params必须排在最前面,其余参数则按照您在minimize调用中通过args传入的相同顺序依次排列):
def loglik_total(params, data, id_list):
    """Return the negative log-likelihood summed over every row of ``data``.

    The parameter vector comes first so the function can be handed directly
    to ``scipy.optimize.minimize`` with ``args=(data, id_list)``.

    Parameters
    ----------
    params : sequence
        ``len(id_list)`` delta values followed by sigma as the last entry.
    data : pandas.DataFrame
        Observations; ``loglik_row`` is applied to each row (``axis=1``).
    id_list : sequence
        Ids; its length determines how many leading entries of ``params``
        are treated as delta parameters.
    """
    # Extract parameters.
    delta_params = list(params[0:len(id_list)])
    sigma_param = params[-1]

    # Calculate the negative log-likelihood for every row in data and sum the values.
    # The extra arguments are forwarded positionally to loglik_row after the row.
    row_logliks = data.apply(
        loglik_row,
        axis=1,
        args=(delta_params, sigma_param, id_list),
    )
    lt = -np.sum(row_logliks)
    return lt
如果您随后这样调用:
# Run Nelder-Mead from the initial guess; `data` and `id_list` are handed to
# the objective after the parameter vector.
res = minimize(
    fun=loglik_total,
    x0=init_params,
    args=(data, id_list),
    method='nelder-mead',
)
它可以正常运行(注意参数顺序x、data、id_list与您传递给loglik_total的顺序一致),res如下所示:
final_simplex: (array([[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09],
[ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09]]), array([-0., -0., -0., -0., -0., -0., -0., -0., -0.]))
fun: -0.0
message: 'Optimization terminated successfully.'
nfev: 930
nit: 377
status: 0
success: True
x: array([ 2.55758096e+05, 6.99890451e+04, -1.41860117e+05,
3.88586258e+05, 3.19488400e+05, 4.90209168e+04,
6.43380010e+04, -1.85436851e+09])
这个输出是否有意义,我无法判断:)