;
DO NOT RUN THIS AGAIN AS IT KEEPS ITERATING!!!!!!!
DO:
.run obs_mslp_as
.run obs_mslp_as_infill
.run obs_mslp_as_infill_iter2
.run obs_mslp_as_infill_iter2
THEN STOP!!!!!
;
; Infills missing Apr-Sep MSLP values by using PCR-based method.
; One difficulty is that some EOFs of the hemispheric SLP have dominance
; in particular regions, but not in others.  To infill some points, therefore,
; some EOFs are of little use.  We could simply increase the number of EOFs
; to retain, to make sure some are kept that have power in each location, but
; this raises the chance of overfitting.  So, what I now do is to perform
; four different PCAs on (1) 0-180, (2) 90-270, (3) 180-360, and (4) 270-90
; portions of the SLP data, and then use the closest region to each point:
; for 45-134 use (1), for 135-224 use (2), for 225-314 use (3), and for
; 315-44 use (4).
;
minr=0.55     ; minimum validated correlation acceptable for infilling
              ; 50%=0.71 40%=0.63 30%=0.55
;
; Restore Apr-Sep MSLP gridded dataset
;
restore,filename='obs_mslp_as.idlsave'
;  timey,fdseas,xlon,ylat,nyr,nx,ny,fdltm,fdsd,missfrac
kkk=missfrac
;
; Building of the PCR model requires some series with complete data.
; If we use all grid boxes, we will be limited to using 30% of the boxes,
; because 70% have at least some years missing.  If we omit the 15 years
; with least coverage, then 69% of the boxes will have complete coverage.
; We can then use the PCR model to infill values as best as we can for
; the subset of years used, but must use some other method for the 15 omitted
; years.
; FOR HARSH DATA REMOVAL, 73 years that have 82% of the boxes complete.
;
; Select the years to analyse
;
ksub=where(missfrac le 0.19,nsub)
if nsub ne 73 then message,'Ooops!'
;
fdsub=fdseas(*,*,ksub)
yrsub=timey(ksub)
;
; Perform PCA on subsetted data; automatically uses only those with complete
; data
;
nregion=5  ; 4 overlapping regions for different PCA, plus region 0 is ALL
usemode=[8,3,4,4,5]     ; use only 5 modes now we've got less data
nretain=8  ; number of modes to use, a balance between overfitting and fitting
cormat=0   ; 0=covariance PCA, 1=correlation PCA
;
for iregion = 0 , nregion-1 do begin
  case iregion of
    0: fd1=fdsub
    1: begin
      keepx=where(xlon le 175)
      fd1=fdsub(keepx,*,*)
      end
    2: begin
      keepx=where((xlon ge 90) and (xlon le 265))
      fd1=fdsub(keepx,*,*)
      end
    3: begin
      keepx=where(xlon ge 180)
      fd1=fdsub(keepx,*,*)
      end
    4: begin
      keepx=where((xlon ge 270) or (xlon le 85))
      fd1=fdsub(keepx,*,*)
      end
  endcase
  ;
  myeof2d,fd1,ev1,ea1,lam,lampct,lamcum,nretain=nretain,correlation=cormat
  ;
  if iregion eq 0 then begin
    ev=ev1
    ea=fltarr(nsub,nretain,nregion)
  endif
  ea(*,*,iregion)=ea1(*,*)
endfor
;
; Make a field to contain status of each box (0=unfilled 1=infilled 2=complete)
;
fdstatus=fltarr(nx,ny)
fddum=reform(ev(*,*,0))
kcomp=where(finite(fddum))
kmiss=where(finite(fddum) eq 0,nmiss)
fdstatus(kcomp)=2.
;
; For each box with some missing data, try to infill with PCR
;
print,'Number to infill:',nmiss
for imiss = 0 , nmiss-1 do begin
  print,imiss,format='($,I5)'
  ;
  ; Extract MSLP series and check for sufficient non-missing data
  ;
  locx=kmiss(imiss) mod nx
xxx=xlon(locx)
if (xxx ge 45) and (xxx le 134) then iregion=1
if (xxx ge 135) and (xxx le 224) then iregion=2
if (xxx ge 225) and (xxx le 314) then iregion=3
if (xxx ge 315) or (xxx le 44) then iregion=4
print,xxx,iregion,format='($,I4,I2)'
nnn=usemode(iregion)
  locy=fix(kmiss(imiss)/nx)
  onets=reform(fdsub(locx,locy,*))
  kgot=where(finite(onets),ngot)
  kwant=where(finite(onets) eq 0,nwant)
  ;
  ; Because of the need for separate testing/training sets, need at least 30
  ; years of data
  ;
  if ngot ge 30 then begin
    ;
    ; Split data into two equal segments
    ;
    setlen1=fix(ngot/2)
    setlen2=ngot-setlen1
    kset1=kgot(0:setlen1-1)
    kset2=kgot(setlen1:ngot-1)
    ;
    for iter = 0 , 2 do begin   ; repeat for test,training & vice versa & full
      case iter of
        0: begin
          ktrain=kset1
          ktest=kset2
        end
        1: begin
          rmean=rskill
          ktrain=kset2
          ktest=kset1
        end
        2: begin
          rmean=(rmean+rskill)*0.5
          print,rmean,format='($,F7.2)'
          ktrain=kgot
          ktest=kwant ; this is in fact the predictions we want, cannot test it
        end
      endcase
      ;
      ; Build PCR (PC-based multiple linear regression)
      ;
      depts=onets(ktrain)
      indts=transpose(ea(ktrain,0:nnn-1,iregion))
      wts=fltarr(n_elements(ktrain))+1.
      pcrcoeff=regress(indts,depts,wts,yfit,pcrconst,pcrsigma,pcrf,lincorr,$
        mulcorr,pcrchi,pcrstat,/relative_weight)
      ;
      ; If regression failed, set skill to zero, else evaluate skill
      ;
      if pcrstat ne 0 then begin
        rskill=-9.99
      endif else begin
        depts=onets(ktest)
        indts=transpose(ea(ktest,0:nnn-1,iregion))
        nval=n_elements(ktest)
        prets=fltarr(nval)
        for ival = 0 , nval-1 do begin
          prets(ival)=pcrconst+total(pcrcoeff*indts(*,ival))
        endfor
        rskill=correlate(depts,prets)
      endelse
      print,mulcorr,rskill,format='($,2F7.2)'
      ;
      ; Determine whether skill is high enough, and store reconstruction
      ; if it is
      ;
      if iter eq 2 then begin
        if rmean ge minr then begin
          fdstatus(locx,locy)=1.
          print,'  INFILLED'
          fdsub(locx,locy,kwant)=prets(*)
        endif else begin
          print,'  FAILED'
        endelse
      endif
      ;
    endfor
    ;
  endif else begin
    print,'  TOO SHORT:',ngot
  endelse
  ;
endfor
;
dummy=where(fdstatus eq 1,ninfill)
print,'Total infilled = ',ninfill
;
; Plot layout of missing, infilled and complete data boxes
;
loadct,39
multi_plot,nrow=2
def_1color,10,color='white'
def_1color,11,color='black'
def_1color,12,color='mgrey'
map=def_map(/npolar) & map.limit(0)=15.
coast=def_coast(/get_device) & coast.double=0
sm=def_sm(/off) & sm.cyclic=0
inter_boxfd,fdstatus,xlon,ylat,map=map,$
  coast=coast,sm=sm,$
  levels=[-0.5,0.5,1.5,2.5],c_colors=[10,11,12]
;
; Put infilled subset back into full dataset
;
fdseas(*,*,ksub)=fdsub(*,*,*)
;
; Also produce a year-by-year timeseries of the fraction of missing data
;
nall=total(finite(fdltm))
nmiss=float(total(finite(fdseas),1))
nmiss=1.-(total(nmiss,1)/nall)
missfrac=nmiss
;
plot,timey,nmiss,psym=10,/xstyle,xtitle='Year',$
  ytitle='Fraction of data missing per summer',yrange=[0,0.5],/ystyle
oplot,timey,kkk,thick=3,psym=10
;
; Now save the results in the same 'format' as the uninfilled data, so they
; can be switched at ease
;
save,filename='obs_mslp_as_infill.idlsave',$
  timey,fdseas,xlon,ylat,nyr,nx,ny,fdltm,fdsd,missfrac,fdstatus
;
end
