package  com.lc.v3.scm.datamining.math;

/**
 * @author
 *
 * Desc: Regression analysis is a widely used statistical method. It is commonly
 * used for forecasting, cause-and-effect analysis and so on.
 * -- Statistical analysis -> regression analysis, generally divided into linear and non-linear regression.
 * Most of us are familiar with linear regression. In fact, the method can be extended to
 * non-linear regression, but such a model is built on the analyst's own experience
 * and practical knowledge.
 * ************************************************************************************/

public  class  Regression
public Regression()


*     k0,k1,....,kn为回归系数,该函数就是要根据一组样本(Xi1,Xi2,...Xin,Yi)i=1,2,...,m[m个样本],根据最小二乘法得原则,计算出
*     最佳得回归分析系数k0,k1,....,kn,从而得到线性回归分析模型,该模型稍加扩展,就可以推广到非线性回归模型
*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  double[]   K  回归系数
*    @param  int  n  回归变量个数
*    @param  int  m  样本个数
*    @return double result  0:失败,其他:成功
public static double  LineRegression(
double[][] X,
double[] Y,
double[] K,
int n,
int m
double result = 0 ;

int  XLen = n+1;
int  YLen = 1;
int i,j,k;
double[][]  coeffX = new double[XLen][XLen];
double[][]  constY = new double[XLen][1];
double[][]  resultK = new double[XLen][1];

double[][] temp = new double[m+1][n+1];
for(i =0;i<n+1;i++)
temp[0][i] = 1;
for(i =0;i<m+1;i++)
temp[i][0] = 1;
for( i=1;i<m+1;i++)
for( j=1;j<n+1;j++)
temp[i][j]= X[i-1][j-1];
for( j= i;j<n+1;j++){
double col = 0 ;
col+= ( temp[k][i]*temp[k][j] );
coeffX[i][j] = col;
coeffX[j][i] = col;

double conTemp =0 ;
conTemp+= ( Y[k-1]*temp[k][i]);


result = Sequation.guassEquation(coeffX,constY,resultK,XLen,1);
if(result ==0 )
//System.out.println("The regression is failed,please check the sample point \n");
return result;
for(i= 0;i<n+1;i++)
K[i] = resultK[i][0];
return result;

*         找那种对模型的拟合度最差的样本点,然后去掉该样本点再进行回归运算.本模型用实际值与模型预测值之间的平均偏差作为度量模型准确性的效
*         用函数。
*         如果平均偏差减少,标识模型在去掉噪声数据后模型拟合的效果增加,否则停止继续优化的步骤。另外,无论如何优化,都必须至少保留样本保有率
*         所确定的最少样本个数
*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  int  n  回归变量个数
*    @param  double[]   K  最终回归系数(返回)
*    @param  int  m  样本个数
*    @param   double retainRate  样本最低保有率
*    @param  int[] LossPoint 被丢弃的样本点(返回)
*    @return double res  -1:失败,1:成功
public static  double optLineRegression( double[][] X,
double[] Y,
double[] K,
int n,
int m,
double retainRate ,
int[] LossPoint){

double res = -1;
//System.out.println("The parameter is not normal ,please check it\n");
return res;
//System.out.println("The retain parameter is not in 0 and 1 \n");
return res;


Double Dtemp = new Double(m*retainRate);
int minsample = Dtemp.intValue();


int lossnum =0 ;

int[] LossPointTemp = new int[m];


double temp = LineRegression(X,Y, K,n,m);
if(temp == 0){
//System.out.println("The regression operation is failed\n");
return res;


double ErrorStd = avgerror(X, Y, K,n, m);

double[][] SampleX = X;
double[]   SampleY = Y;
double[]    CoeffK = K;
int SampleNum = m;

int[] change = new int[m];
for(int k=0;k<m;k++)
int index_max = -1;

for (int i=m;i>minsample ;i-- )
index_max = maxErrorIndex(SampleX,SampleY,CoeffK,n,SampleNum);

if(index_max == -1)
return -1;


int Loss = change[index_max];
lossnum +=1;
SampleNum -=1;

double[][] SampleXTemp = SampleX;
SampleX = new double[SampleN

double[] SampleYTemp = SampleY;
SampleY  = new double[SampleNum];

for(int j= 0;j<index_max;j++){
for(int k=0;k<n;k++)

for(int j= index_max;j<SampleNum;j++){
for(int k=0;k<n;k++){
SampleY[j] = SampleYTemp[j+1];
change[j] = change[j+1];




double  ErrorOpt = avgerror(SampleX,SampleY,CoeffK,n,SampleNum);

if (ErrorOpt>=ErrorStd)//优化过程没有任何改良
return  1;

LossPoint[m-i] = Loss;


return 1;

*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  int  n  回归变量个数
*    @param  double[]   K  回归系数
*    @param  int  m  样本个数
*    @return double Ierror :-1失败 平均误差
public static double avgerror(double[][] X,double[] Y,double[] K,int n,int m){
double res = -1;
double[] YF = new double[m];
for(int i=0;i<m;i++){
YF[i] = K[0];
for(int j=0;j<n;j++)

res = Base.dimaddavg(Y,YF);
return res;

*    @param  double[][]  X  自变量样本集
*    @param  double[]    Y  变量结果集
*    @param  int  n  回归变量个数
*    @param  double[]   K  回归系数
*    @param  int  m  样本个数
*    @return int Index_id :-1失败
public static int maxErrorIndex(double[][] X,double[] Y,double[] K,int n,int m){
int index_id = -1;
double[] YF = new double[m];
for(int i=0;i<m;i++){
YF[i] = K[0];
for(int j=0;j<n;j++)

index_id = Base.maxerrordim(Y,YF);
return index_id;


package com.lc.v3.scm.datamining.math;

public class  Base
public Base(){

*Desc:get the max value for a  nonmultidimensional array
public static double max( double[] sample){
int n = sample.length;
double result =sample[0];
for (int i=1;i<n ;i++ )
if (sample[i]>result)
return result;

*Desc:get the min value for a nonmultidimensional array
public static double min( double[] sample){
int n = sample.length;
double result =sample[0];
for (int i=1;i<n ;i++ )
if (sample[i]<result)
return result;

*Desc:get the average value  for a nonmultidimensional array
public static double avg( double[] sample){
int n = sample.length;
double result =sample[0];
for (int i=1;i<n ;i++ )
result +=sample[i];
return result/n;

*Desc: get the variance (方差)for a nonmultidimensional array
* (Inpute)输入参数:
*     @param double[] sample 样本数组
* (OutPute)输出参数:
*     @return double if ==-1,样本为空
* (Logic)计算逻辑:
*     sample - 平均值的平方求和
public static double val( double[] sample){
int n = sample.length;
double result = -1;
if (n<=0)
return result ;
double avg = avg(sample);
result = 0;
for(int j=0;j<n;j++)
result += (sample[j]-avg)*(sample[j]-avg);

return result;

*Desc: get the standard variance(标准方差) for a nonmultidimensional array
* (Inpute)输入参数:
*     @param double[] sample 样本数组
* (Output)输出参数:
*     @return double if ==-1,样本为空
* (Logic)计算逻辑:
*     sample[]的方差开方
public static double stdval (double[] sample){
int n = sample.length;
double result = -1;
return result;
}else {
result = val(sample);
result = Math.sqrt(result);
return result;

*Desc: get the max distance  for a nonmultidimensional array
* (Inpute)输入参数:
*     @param double[] sample 样本数组
* (Outpute)输出参数:
*     @return double if ==-1,样本为空
* (Logic)计算逻辑:
*     The array of sample 's max value subtract Its
*     min value [sample[]的最大值-最小值]
public static double maxdis (double[] sample){
int n = sample.length;
double result = -1;
return result;
}else {
result = max(sample)-min(sample);
return result;

*Desc: get the average warp for a nonmultidimensional array
* (Inpute)输入参数:
*     @param double[] sample 样本数组
* (Outpute)输出参数:
*     @return double if ==-1,样本为空
* (Logic)计算逻辑:
*     sample[]的每一个元素-平均值的绝对值的平均值
public static double  meddev (double[] sample){
int n = sample.length;
double result = -1;
return result;
}else {
double avg = avg(sample);
result =0;
for(int j=0;j<n;j++){
result += Math.abs(sample[j]-avg);
result /=n;
return result;

*Desc: get the inside multiplication of  two array
*     @param double[] sample1
*     @param double[] sample2
*     @return double res;
*   Sum the value of two array's corresponding element production [对应元素乘积的和]

public static double inmul(double[] sample1,double[] sample2){
double result = 1.0/0.0;
int len1 = sample1.length;
int len2 = sample2.length;
if(len1 != len2){
System.out.println("two vector dose not at the same dimension,please check it\n");
return result ;
result = 0;
for (int i=0;i<len1 ;i++ )
result += sample1[i]*sample2[i];

return result;

*Desc : get the opposite production of two array to comprise a new array
*     @param double[] sample1
*     @param double[] sample2
*     @return double[] result;
*   形成新的向量,元素为对应元素的乘积

public static double[] vecmul(double[] sample1,double[] sample2){
int len1 = sample1.length;
int len2 = sample2.length;
double[] result = new double[len1];
if(len1 != len2){
System.out.println("two vector dose not at the same dimension,please check it\n");
return result ;
for (int i=0;i<len1 ;i++ )
result[i]= sample1[i]*sample2[i];

return result;

*Desc : sort the element of the array into a from  great to little order
public  void  decorder(double[] sample){
int n = sample.length;
for(int i=0;i<n-1;i++){
for(int j=i+1;j<n;j++){
if (sample[i]<sample[j])
double temp = sample[i];

*Desc : sort the element of the array into a from  little to great order
public  void  ascorder(double[] sample){
int n = sample.length;
for(int i=0;i<n-1;i++){
for(int j=i+1;j<n;j++){
if (sample[i]>sample[j])
double temp = sample[i];

*Desc: get two point distance of standard (Euclid distance)
*     @param  double[] Pfrom
*     @param  double[] Pto
* Outpute:
*     @param  double res
* Logic:
public static double eudis(double[] Pfrom ,double[] Pto){
int Lfrom = Pfrom.length;
int Lto = Pto.length;
if (Lfrom != Lto)
return -1;
double res = 0;
for(int i=0;i<Lfrom;i++){
double temp = Pfrom[i]-Pto[i];
return Math.sqrt(res);

*Desc : get the min value of two array's corresponding element's distance
*    @param  double[] Pfrom
*    @parma  double[] Pto
*    @return double res
public static double mindis(double[] Pfrom,double[] Pto){
int Lfrom = Pfrom.length;
int Lto = Pto.length;
if (Lfrom != Lto)
return -1;
double res = Math.abs(Pfrom[0]-Pto[0]);
for(int i=1;i<Lfrom;i++){
double temp = Math.abs(Pfrom[i]-Pto[i]);
if (temp < res )
res =  temp;
return res;

*Desc : get the max value of two array's corresponding element's distance
*   @param  double[] Pfrom
*   @parma  double[] Pto
*   @return double res
public static double maxdis(double[] Pfrom,double[] Pto){
int Lfrom = Pfrom.length;
int Lto = Pto.length;
if (Lfrom != Lto)
return -1;
double res = Math.abs(Pfrom[0]-Pto[0]);
for(int i=1;i<Lfrom;i++){
double temp = Math.abs(Pfrom[i]-Pto[i]);
if (temp > res )
res =  temp;
return res;

*Desc : get the total sum value of two array's corresponding element's distance
*   @param  double[] Pfrom
*   @parma  double[] Pto
*   @return double res
public static double sumdis(double[] Pfrom,double[] Pto){
int Lfrom = Pfrom.length;
int Lto = Pto.length;
if (Lfrom != Lto)
return -1;
double res = Math.abs(Pfrom[0]-Pto[0]);
for(int i=1;i<Lfrom;i++){
double temp = Math.abs(Pfrom[i]-Pto[i]);
res += temp;
return res;

*Desc : get the averagedistance of two array's corresponding element's distance
*   @param  double[] Pfrom
*   @parma  double[] Pto
*   @return double res
public static double dimaddavg(double[] Pfrom,double[] Pto){
int Lfrom = Pfrom.length;
int Lto = Pto.length;
if (Lfrom != Lto)
return -1;
double res = Math.abs(Pfrom[0]-Pto[0]);
for(int i=1;i<Lfrom;i++){
double temp = Math.abs(Pfrom[i]-Pto[i]);
res += temp;
return res/Lfrom;

*Desc : get the diemnsional id  of two array's corresponding element's distance
*       who has tha max value
*   @param  double[] Pfrom
*   @parma  double[] Pto
*   @return Int  id
public static int maxerrordim(double[] Pfrom,double[] Pto){
int Lfrom = Pfrom.length;
int Lto = Pto.length;
if (Lfrom != Lto)
return -1;
double maxerror=Math.abs(Pfrom[0]-Pto[0]);
int res = 0;
for(int i=1;i<Lfrom;i++){
double temp = Math.abs(Pfrom[i]-Pto[i]);
if ( temp>maxerror)
res = i;
return res;


/**
 * @author leitw
 *
 * This class was written for systems of linear equations. Linear equation systems are
 * widely used, for example in regression analysis, linear programming in operational
 * management and so on.
 * This class offers some basic methods for solving systems of linear equations;
 * as a first step it offers the Gauss method.
 * ****************************************/

public class Sequation
public Sequation()

*    @param  double[][]  coeffA  系数矩阵 n×n
*    @param  double[][]  constB  常向量  线性方程组的右端 n×m
*    @param  double[][]  resultX 返回线性方程组的解   n×m
*    @param  int    n  矩阵coeffA的阶数
*    @param  int    m  矩阵const的列数
*    @return double abs  矩阵coeffA的行列式,如果abs=0,比较复杂,本函数不作处理,认为没有希望得到的解当|coeffA|=0,可能有无穷多解
public static  double  guassEquation(double[][] coeffA ,double[][] constB, double[][] resultX , int n , int m )
int i,j,k,row,line;
double temp,max,abs=1;
int[] change = new int
for(i=0;i<n;i++) change[i]=i ;
row=i;line=i; max = Math.abs(coeffA[i][i]);
temp = Math.abs(coeffA[j][k]);
max = temp;
row = j;
line = k;


*如果max=0 ,表示行列式为0,返回0,退出
return 0;
if(row != i)
temp = coeffA[i][k];
coeffA[i][k] = coeffA[row][k];
coeffA[row][k] = temp ;

if(line != i)
temp = coeffA[j][line];
coeffA[j][line]= coeffA[j][i];
change[line]= k;

abs *=coeffA[i][i];
for(k=i+1;k<n;k++) coeffA[i][k]/=coeffA[i][i];
for(k=0;k<m;k++) constB[i][k] /= coeffA[i][i];

for(k=i+1;k<n;k++) coeffA[j][k] -= coeffA[j][i]*coeffA[i][k];
for(k=0;k<m;k++) constB[j][k] -= coeffA[j][i]*constB[i][k];
coeffA[j][i] =0 ;


abs *= coeffA[n-1][n-1];

constB[n-1][k] /= coeffA[n-1][n-1];

return abs ;
