您的位置:首页 > 编程语言 > Java开发

java多元线性回归

2016-06-21 16:43 477 查看
package  com.lc.v3.scm.datamining.math;

/*************************************************************************************
* @author
*
*
*
*
******************************************************************************/

/***********************************************************************************
* Desc: Regression analysis is a widely used statistical method. It is commonly
* used for forecasting, cause-and-effect analysis and so on.
* -- Statistical analysis -> regression analysis, which is generally divided into
*    linear regression and nonlinear regression.
* Linear regression is the familiar case. In fact, the method can be extended to
* nonlinear regression, but such a model has to be built on the analyst's own
* experience and practical knowledge.
* ************************************************************************************/

public  class  Regression
{
public Regression()
{

}

/***************************************************************************************
*模型:多元线性回归
*简要说明:根据一个样本集,计算线性回归分析系数。线性回归分析的模型为:Y=k0+k1X1+k2X2+....+knXn,其中,X1,X2,....,Xn为因变量,Y为变量
*     k0,k1,....,kn为回归系数,该函数就是要根据一组样本(Xi1,Xi2,...Xin,Yi)i=1,2,...,m[m个样本],根据最小二乘法得原则,计算出
*     最佳得回归分析系数k0,k1,....,kn,从而得到线性回归分析模型,该模型稍加扩展,就可以推广到非线性回归模型
*输入参数:
*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  double[]   K  回归系数
*    @param  int  n  回归变量个数
*    @param  int  m  样本个数
*输出参数:
*    @return double result  0:失败,其他:成功
****************************************************************************************/
public static double  LineRegression(
double[][] X,
double[] Y,
double[] K,
int n,
int m
)
{
double result = 0 ;

/*
*线性回归问题,最终转换为解一个对称线性方程组的求解问题
*线性方程组的系数矩阵为n+1*n+1,常数矩阵为n+1*1
*/
int  XLen = n+1;
int  YLen = 1;
int i,j,k;
double[][]  coeffX = new double[XLen][XLen];
double[][]  constY = new double[XLen][1];
double[][]  resultK = new double[XLen][1];

/*
*根据参数,计算所要求解方程组的系数矩阵、常数矩阵
*/
double[][] temp = new double[m+1][n+1];
for(i =0;i<n+1;i++)
{
temp[0][i] = 1;
}
for(i =0;i<m+1;i++)
{
temp[i][0] = 1;
}
for( i=1;i<m+1;i++)
for( j=1;j<n+1;j++)
temp[i][j]= X[i-1][j-1];
/*
*开始计算每一个系数
*/
for(i=0;i<n+1;i++)
{
/*
*coeffX的第i行和i列的系数,注意,是对称矩阵
*/
for( j= i;j<n+1;j++){
double col = 0 ;
for(k=1;k<m+1;k++)
col+= ( temp[k][i]*temp[k][j] );
coeffX[i][j] = col;
coeffX[j][i] = col;
}

/*
*constY的第i个元素
*/
double conTemp =0 ;
for(k=1;k<m+1;k++)
conTemp+= ( Y[k-1]*temp[k][i]);
constY[i][0]=conTemp;

}

/*
*调用Sequation方法,解线性方程组
*/
result = Sequation.guassEquation(coeffX,constY,resultK,XLen,1);
if(result ==0 )
{
//System.out.println("The regression is failed,please check the sample point \n");
return result;
}else{
for(i= 0;i<n+1;i++)
K[i] = resultK[i][0];
}
return result;
}

/*****************************************************************************************
*模型名称:样本自优化线性回归
*简要说明:该模型是对简单线性回归模型的改良,其基本的考虑是所提供的样本数据中可能有些异常点使得模型的精度大大降低,系统通过一个度量函数自动
*         找那种对模型的拟合度最差的样本点,然后去掉该样本点再进行回归运算.本模型用实际值与模型预测值之间的平均偏差作为度量模型准确性的效
*         用函数。
*         如果平均偏差减少,标识模型在去掉噪声数据后模型拟合的效果增加,否则停止继续优化的步骤。另外,无论如何优化,都必须至少保留样本保有率
*         所确定的最少样本个数
*注:该模型尚未得到理论论证,是作者自己的经验总结
*函数参数:
*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  int  n  回归变量个数
*    @param  double[]   K  最终回归系数(返回)
*    @param  int  m  样本个数
*    @param   double retainRate  样本最低保有率
*    @param  int[] LossPoint 被丢弃的样本点(返回)
*输出参数:
*    @return double res  -1:失败,1:成功
*******************************************************************************************/
public static  double optLineRegression( double[][] X,
double[] Y,
double[] K,
int n,
int m,
double retainRate ,
int[] LossPoint){

double res = -1;
if(n<1||m<1){
//System.out.println("The parameter is not normal ,please check it\n");
return res;
}
if(retainRate>=1.0||retainRate<=0.0){
//System.out.println("The retain parameter is not in 0 and 1 \n");
return res;
}

//必须确保的最小样本个数

Double Dtemp = new Double(m*retainRate);
int minsample = Dtemp.intValue();

//被丢弃的样本点的个数

int lossnum =0 ;

int[] LossPointTemp = new int[m];

//进行第一次回归

double temp = LineRegression(X,Y, K,n,m);
if(temp == 0){
//System.out.println("The regression operation is failed\n");
return res;
}

//第一次的平均误差

double ErrorStd = avgerror(X, Y, K,n, m);

double[][] SampleX = X;
double[]   SampleY = Y;
double[]    CoeffK = K;
int SampleNum = m;

/*
*记载样本位置的变化情况
*/
int[] change = new int[m];
for(int k=0;k<m;k++)
change[k]=k;
int index_max = -1;

for (int i=m;i>minsample ;i-- )
{
/*
*找当前回归样本中的误差最大的样本index。
*/
index_max = maxErrorIndex(SampleX,SampleY,CoeffK,n,SampleNum);

if(index_max == -1)
return -1;

/*
*第index_max为误差最大样本,去掉该样本
*/

int Loss = change[index_max];
lossnum +=1;
SampleNum -=1;

double[][] SampleXTemp = SampleX;
SampleX = new double[SampleN
4000
um]
;

double[] SampleYTemp = SampleY;
SampleY  = new double[SampleNum];

for(int j= 0;j<index_max;j++){
for(int k=0;k<n;k++)
{
SampleX[j][k]=SampleXTemp[j+1][k];
}
SampleY[j]=SampleYTemp[j];
}

for(int j= index_max;j<SampleNum;j++){
for(int k=0;k<n;k++){
SampleX[j][k]=SampleXTemp[j+1][k];
}
SampleY[j] = SampleYTemp[j+1];
change[j] = change[j+1];

}

/*
*利用新的样本进行回归
*/
res=LineRegression(SampleX,SampleY,CoeffK,n,SampleNum);

/*
*比较新的预测模型误差与老模型的误差,如果没有改良,结束,否则,记载相关结果,并继续
*/

double  ErrorOpt = avgerror(SampleX,SampleY,CoeffK,n,SampleNum);

if (ErrorOpt>=ErrorStd)//优化过程没有任何改良
{
return  1;
}
else
{

/*
*记载样本删减情况,记载回归系数
*/
LossPoint[m-i] = Loss;

K=CoeffK;

}
}
return 1;
}

/************************************************************************************
*简要说明:计算回归模型的平均误差
*输入参数:
*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  int  n  回归变量个数
*    @param  double[]   K  回归系数
*    @param  int  m  样本个数
*输出参数:
*    @return double Ierror :-1失败 平均误差
*************************************************************************************/
public static double avgerror(double[][] X,double[] Y,double[] K,int n,int m){
double res = -1;
/*
*YF用于存放模型的预测结果
*/
double[] YF = new double[m];
for(int i=0;i<m;i++){
YF[i] = K[0];
for(int j=0;j<n;j++)
YF[i]+=X[i][j]*K[j+1];
}

/*
*计算初始预测与真实值的误差(平均维和距)
*/
res = Base.dimaddavg(Y,YF);
return res;
}

/************************************************************************************
*简要说明:计算回归模型的误差最大的样本点的index_id
*输入参数:
*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  int  n  回归变量个数
*    @param  double[]   K  回归系数
*    @param  int  m  样本个数
*输出参数:
*    @return int Index_id :-1失败
************************************************************************************/
public static int maxErrorIndex(double[][] X,double[] Y,double[] K,int n,int m){
int index_id = -1;
/*
*YF用于存放模型的预测结果
*/
double[] YF = new double[m];
for(int i=0;i<m;i++){
YF[i] = K[0];
for(int j=0;j<n;j++)
YF[i]+=X[i][j]*K[j+1];
}

/*
*计算误差最大的样本点的index
*/
index_id = Base.maxerrordim(Y,YF);
return index_id;
}

}

package com.lc.v3.scm.datamining.math;

/**
 * Basic statistics and vector helpers operating on one-dimensional
 * {@code double} arrays.
 */
public class Base
{
    public Base()
    {
    }

    /**
     * Returns the largest element of a one-dimensional array.
     *
     * @param sample the values; must contain at least one element
     * @return the maximum value
     */
    public static double max(double[] sample)
    {
        double best = sample[0];
        for (int i = 1; i < sample.length; i++) {
            if (sample[i] > best) {
                best = sample[i];
            }
        }
        return best;
    }

    /**
     * Returns the smallest element of a one-dimensional array.
     *
     * @param sample the values; must contain at least one element
     * @return the minimum value
     */
    public static double min(double[] sample)
    {
        double best = sample[0];
        for (int i = 1; i < sample.length; i++) {
            if (sample[i] < best) {
                best = sample[i];
            }
        }
        return best;
    }

    /**
     * Returns the arithmetic mean of a one-dimensional array.
     *
     * @param sample the values; must contain at least one element
     * @return the sum of all elements divided by their count
     */
    public static double avg(double[] sample)
    {
        int count = sample.length;
        double total = sample[0];
        for (int i = 1; i < count; i++) {
            total += sample[i];
        }
        return total / count;
    }

    /**
     * Returns the variance of a one-dimensional array: the mean of the squared
     * differences between each element and the average (divides by the element
     * count).
     *
     * @param sample the sample array
     * @return the variance, or {@code -1} when the array is empty
     */
    public static double val(double[] sample)
    {
        int count = sample.length;
        if (count <= 0) {
            return -1;
        }
        double mean = avg(sample);
        double total = 0;
        for (int i = 0; i < count; i++) {
            total += (sample[i] - mean) * (sample[i] - mean);
        }
        return total / count;
    }

    /**
     * Returns the standard deviation of a one-dimensional array, i.e. the
     * square root of {@link #val(double[])}.
     *
     * @param sample the sample array
     * @return the standard deviation, or {@code -1} when the array is empty
     */
    public static double stdval(double[] sample)
    {
        if (sample.length <= 0) {
            return -1;
        }
        return Math.sqrt(val(sample));
    }

    /**
     * Returns the range of a one-dimensional array (maximum minus minimum).
     *
     * @param sample the sample array
     * @return the range, or {@code -1} when the array is empty
     */
    public static double maxdis(double[] sample)
    {
        if (sample.length <= 0) {
            return -1;
        }
        return max(sample) - min(sample);
    }

    /**
     * Returns the mean absolute deviation of a one-dimensional array: the
     * average of the absolute differences between each element and the mean.
     *
     * @param sample the sample array
     * @return the mean absolute deviation, or {@code -1} when the array is empty
     */
    public static double meddev(double[] sample)
    {
        int count = sample.length;
        if (count <= 0) {
            return -1;
        }
        double mean = avg(sample);
        double total = 0;
        for (int i = 0; i < count; i++) {
            total += Math.abs(sample[i] - mean);
        }
        return total / count;
    }

    /**
     * Returns the inner product of two arrays (sum of the products of
     * corresponding elements).
     *
     * @param sample1 first vector
     * @param sample2 second vector
     * @return the inner product; positive infinity (after printing a message)
     *         when the lengths differ
     */
    public static double inmul(double[] sample1, double[] sample2)
    {
        if (sample1.length != sample2.length) {
            System.out.println("two vector dose not at the same dimension,please check it\n");
            return Double.POSITIVE_INFINITY;
        }
        double total = 0;
        for (int i = 0; i < sample1.length; i++) {
            total += sample1[i] * sample2[i];
        }
        return total;
    }

    /**
     * Returns a new array whose elements are the products of the corresponding
     * elements of the two inputs.
     *
     * @param sample1 first vector
     * @param sample2 second vector
     * @return the element-wise product; when the lengths differ a message is
     *         printed and an all-zero array of {@code sample1.length} is returned
     */
    public static double[] vecmul(double[] sample1, double[] sample2)
    {
        double[] product = new double[sample1.length];
        if (sample1.length != sample2.length) {
            System.out.println("two vector dose not at the same dimension,please check it\n");
            return product;
        }
        for (int i = 0; i < sample1.length; i++) {
            product[i] = sample1[i] * sample2[i];
        }
        return product;
    }

    /**
     * Sorts the array in place into descending order (largest first).
     *
     * @param sample the array to sort
     */
    public void decorder(double[] sample)
    {
        int count = sample.length;
        for (int i = 0; i < count - 1; i++) {
            for (int j = i + 1; j < count; j++) {
                if (sample[i] < sample[j]) {
                    // Swap through a temporary so neither value is lost.
                    double held = sample[i];
                    sample[i] = sample[j];
                    sample[j] = held;
                }
            }
        }
    }

    /**
     * Sorts the array in place into ascending order (smallest first).
     *
     * @param sample the array to sort
     */
    public void ascorder(double[] sample)
    {
        int count = sample.length;
        for (int i = 0; i < count - 1; i++) {
            for (int j = i + 1; j < count; j++) {
                if (sample[i] > sample[j]) {
                    // Swap through a temporary so neither value is lost.
                    double held = sample[i];
                    sample[i] = sample[j];
                    sample[j] = held;
                }
            }
        }
    }

    /**
     * Returns the Euclidean distance between two points.
     *
     * @param Pfrom first point
     * @param Pto   second point
     * @return the distance, or {@code -1} when the dimensions differ
     */
    public static double eudis(double[] Pfrom, double[] Pto)
    {
        if (Pfrom.length != Pto.length) {
            return -1;
        }
        double squares = 0;
        for (int i = 0; i < Pfrom.length; i++) {
            double diff = Pfrom[i] - Pto[i];
            squares += diff * diff;
        }
        return Math.sqrt(squares);
    }

    /**
     * Returns the smallest absolute difference between corresponding elements
     * of two arrays.
     *
     * @param Pfrom first array; must contain at least one element
     * @param Pto   second array
     * @return the minimum distance, or {@code -1} when the lengths differ
     */
    public static double mindis(double[] Pfrom, double[] Pto)
    {
        if (Pfrom.length != Pto.length) {
            return -1;
        }
        double smallest = Math.abs(Pfrom[0] - Pto[0]);
        for (int i = 1; i < Pfrom.length; i++) {
            double gap = Math.abs(Pfrom[i] - Pto[i]);
            if (gap < smallest) {
                smallest = gap;
            }
        }
        return smallest;
    }

    /**
     * Returns the largest absolute difference between corresponding elements
     * of two arrays.
     *
     * @param Pfrom first array; must contain at least one element
     * @param Pto   second array
     * @return the maximum distance, or {@code -1} when the lengths differ
     */
    public static double maxdis(double[] Pfrom, double[] Pto)
    {
        if (Pfrom.length != Pto.length) {
            return -1;
        }
        double largest = Math.abs(Pfrom[0] - Pto[0]);
        for (int i = 1; i < Pfrom.length; i++) {
            double gap = Math.abs(Pfrom[i] - Pto[i]);
            if (gap > largest) {
                largest = gap;
            }
        }
        return largest;
    }

    /**
     * Returns the sum of the absolute differences between corresponding
     * elements of two arrays.
     *
     * @param Pfrom first array; must contain at least one element
     * @param Pto   second array
     * @return the summed distance, or {@code -1} when the lengths differ
     */
    public static double sumdis(double[] Pfrom, double[] Pto)
    {
        if (Pfrom.length != Pto.length) {
            return -1;
        }
        double total = Math.abs(Pfrom[0] - Pto[0]);
        for (int i = 1; i < Pfrom.length; i++) {
            total += Math.abs(Pfrom[i] - Pto[i]);
        }
        return total;
    }

    /**
     * Returns the average of the absolute differences between corresponding
     * elements of two arrays.
     *
     * @param Pfrom first array; must contain at least one element
     * @param Pto   second array
     * @return the mean absolute difference, or {@code -1} when the lengths differ
     */
    public static double dimaddavg(double[] Pfrom, double[] Pto)
    {
        int count = Pfrom.length;
        if (count != Pto.length) {
            return -1;
        }
        double total = Math.abs(Pfrom[0] - Pto[0]);
        for (int i = 1; i < count; i++) {
            total += Math.abs(Pfrom[i] - Pto[i]);
        }
        return total / count;
    }

    /**
     * Returns the index at which the absolute difference between the two
     * arrays is largest. On ties the lowest index wins.
     *
     * @param Pfrom first array; must contain at least one element
     * @param Pto   second array
     * @return the index of the largest difference, or {@code -1} when the
     *         lengths differ
     */
    public static int maxerrordim(double[] Pfrom, double[] Pto)
    {
        if (Pfrom.length != Pto.length) {
            return -1;
        }
        double largest = Math.abs(Pfrom[0] - Pto[0]);
        int where = 0;
        for (int i = 1; i < Pfrom.length; i++) {
            double gap = Math.abs(Pfrom[i] - Pto[i]);
            if (gap > largest) {
                where = i;
                largest = gap;
            }
        }
        return where;
    }

}

/***********************************************************************
* @author leitw
*
*
*
*
**********************************************************************/

/***********************************************************************
* This class was written for systems of linear equations. Linear equation
* systems are widely used, for example in regression analysis and in linear
* programming for operational management.
* This class offers some basic methods for solving linear equation systems;
* as a first step it provides the Gaussian elimination method.
***********************************************************************/

/************************************************************************
*线性方程组,根据AX=B,A为系数矩阵,B为常数矩阵
*线性方程求解法则参见线性代数
*
*****************************************************************/
public class Sequation
{
public Sequation()
{
}

/***********************************************************************
*简要说明:全选主元高斯消元法
*功能:解线性方程组
*输入参数:
*    @param  double[][]  coeffA  系数矩阵 n×n
*    @param  double[][]  constB  常向量  线性方程组的右端 n×m
*    @param  double[][]  resultX 返回线性方程组的解   n×m
*    @param  int    n  矩阵coeffA的阶数
*    @param  int    m  矩阵const的列数
*输出参数:
*    @return double abs  矩阵coeffA的行列式,如果abs=0,比较复杂,本函数不作处理,认为没有希望得到的解当|coeffA|=0,可能有无穷多解
**********************************************************************/
public static  double  guassEquation(double[][] coeffA ,double[][] constB, double[][] resultX , int n , int m )
{
int i,j,k,row,line;
double temp,max,abs=1;
/*
*change用于记载系数矩阵列交换的信息
*/
int[] change = new int
;
for(i=0;i<n;i++) change[i]=i ;
/*
*从矩阵的第一行开始
*a、找主元
*b、行列互换
*c、线性变换
*/
for(i=0;i<n-1;i++)
{
/*
*找主元
*/
row=i;line=i; max = Math.abs(coeffA[i][i]);
for(j=i;j<n;j++)
{
for(k=i;k<n;k++)
{
temp = Math.abs(coeffA[j][k]);
if(temp>max)
{
max = temp;
row = j;
line = k;

}

}
}
/*
*主元找到了为第row行,第line列,值为max
*如果max=0 ,表示行列式为0,返回0,退出
*/
if(max==0)
{
return 0;
}
/*
*第二步,行列互换,准备先行变换
*/
if(row != i)
{
for(k=i;k<n;k++)
{
temp = coeffA[i][k];
coeffA[i][k] = coeffA[row][k];
coeffA[row][k] = temp ;
}
for(k=0;k<m;k++)
{
temp=constB[i][k];
constB[i][k]=constB[row][k];
constB[row][k]=temp;
}
}

if(line != i)
{
for(j=0;j<n;j++)
{
temp = coeffA[j][line];
coeffA[j][line]= coeffA[j][i];
coeffA[j][i]=temp;
}
/*
*记载变量位置的变化(列变换信息标识了变量位置的变化信息)
*/
k=change[i];
change[i]=change[line];
change[line]= k;
}

/*
*开始线性变换,先对第i行归一化,然后对余行线性变换
*/
abs *=coeffA[i][i];
for(k=i+1;k<n;k++) coeffA[i][k]/=coeffA[i][i];
for(k=0;k<m;k++) constB[i][k] /= coeffA[i][i];
coeffA[i][i]=1;

/*
*余矩阵变换
*/
for(j=i+1;j<n;j++)
{
for(k=i+1;k<n;k++) coeffA[j][k] -= coeffA[j][i]*coeffA[i][k];
for(k=0;k<m;k++) constB[j][k] -= coeffA[j][i]*constB[i][k];
coeffA[j][i] =0 ;

}

}
abs *= coeffA[n-1][n-1];

/*
*回代消元
*/
for(k=0;k<m;k++)
{
constB[n-1][k] /= coeffA[n-1][n-1];
for(i=n-2;i>=0;i--)
for(j=i+1;j<n;j++)
constB[i][k]-=coeffA[i][j]*constB[j][k];
}

/*
*根据change,调整变量顺序,得最后解
*/
for(i=0;i<n;i++)
{
for(j=0;j<m;j++)
{
resultX[change[i]][j]=constB[i][j];
}
}
return abs ;
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: