//Program the four QTimer3 channels. The module is stopped first (TMR3_ENBL=0) and is NOT restarted here;
//whoever calls this is expected to set TMR3_ENBL when everything else is ready.
//  channel 0: pixel clock, output toggles on compare and triggers FlexIO2
//  channel 1: free-running timestamp counter with edge capture
//  channel 2: divider for the optical-switch edges arriving on pin 3.3
//  channel 3: slow clock (IP/16 further divided by tmr3_slow_div) feeding channels 0 and 1
void configureTMR3Timers(){
  const uint16_t free_run_top=0xFFFF;//compare value used by the two free-running counters (channels 0 and 1)

  TMR3_ENBL=0;//stop all channels while they are being set up

  //---- Channel 0: triggers FlexIO2 to output pixel data, once per pixel
  //(the two brightness sub-steps per pixel are handled by FlexIO).
  //Clock source is the TMR3.3 output, counted on rising+falling edges, continuous count with roll-over,
  //output toggles on compare. (Earlier alternative: CM(1)|PCS(15), i.e. rising edges of IP/128 = 1171875 Hz.)
  const uint16_t ch0_ctrl=TMR_CTRL_CM(2) | TMR_CTRL_PCS(7) | TMR_CTRL_OUTMODE(3);
  TMR3_CTRL0=ch0_ctrl;
  TMR3_CNTR0=0;
  TMR3_COMP10=free_run_top;//on reaching this value the output toggles (FlexIO2 trigger) and counting continues
  TMR3_CMPLD10=free_run_top;//value copied into COMP10 on each COMP10 match; this is the register the DMA rewrites
  TMR3_LOAD0=0;
  //FORCE drives the output to 0 so initialization is reproducible, OEN enables the qtimer3.0 output.
  //The compare interrupt (TMR_SCTRL_TCFIE) is deliberately not enabled here.
  const uint16_t ch0_sctrl=TMR_SCTRL_FORCE | TMR_SCTRL_OEN;
  TMR3_SCTRL0=ch0_sctrl;
  TMR3_CSCTRL0=TMR_CSCTRL_CL1(1);//reload COMP10 from CMPLD10 on every COMP10 match
  //The DMA request on CMPLD10 use (TMR3_DMA0=TMR_DMA_CMPLD1DE) is not enabled here.

  //---- Channel 1: its input receives the divided opto edges; each captured edge yields a time stamp
  //that is synchronous to channel 0 and may be read in the interrupt.
  //Same clock as channel 0 (rising+falling edges of the TMR3.3 output), continuous count with roll-over.
  const uint16_t ch1_ctrl=TMR_CTRL_CM(2) | TMR_CTRL_PCS(7) | TMR_CTRL_SCS(1);
  TMR3_CTRL1=ch1_ctrl;
  TMR3_CNTR1=0;
  TMR3_COMP11=free_run_top;
  TMR3_CMPLD11=free_run_top;
  TMR3_LOAD1=0;
  //Capture on rising and falling edge of the input; because that input toggles, this is every
  //opt_edges_every-th rising edge of the opto signal. OR in TMR_SCTRL_IEFIE to enable the interrupt.
  TMR3_SCTRL1=TMR_SCTRL_CAPTURE_MODE(3);
  TMR3_CSCTRL1=0;//no preload

  //---- Channel 2: divides the opto edges from pin 3.3; its output reaches the channel 1 input through XBAR.
  //Counts rising edges of pin 3.3, reloads on compare, toggles its output on compare.
  const uint16_t ch2_ctrl=TMR_CTRL_CM(1) | TMR_CTRL_PCS(3) | TMR_CTRL_LENGTH | TMR_CTRL_OUTMODE(3);
  TMR3_CTRL2=ch2_ctrl;
  TMR3_CNTR2=0;
  TMR3_COMP12=0;//divide by one for now; opt_edges_every-1 is the value for the reduced rate
  TMR3_CMPLD12=0;//must be written even though no preload is requested
  TMR3_LOAD2=0;
  //IPS inverts the input so falling edges are counted; the divided result appears as toggles on the 3.2 output.
  const uint16_t ch2_sctrl=TMR_SCTRL_IPS | TMR_SCTRL_OEN;
  TMR3_SCTRL2=ch2_sctrl;
  TMR3_CSCTRL2=0;//no preload

  //---- Channel 3: slows the 150 MHz input clock for channels 0 and 1, giving finer granularity
  //than the built-in prescalers alone. Counts rising edges of IP/16, reloads on compare, toggles output on compare.
  const uint16_t ch3_ctrl=TMR_CTRL_CM(1) | TMR_CTRL_PCS(12) | TMR_CTRL_LENGTH | TMR_CTRL_OUTMODE(3);
  TMR3_CTRL3=ch3_ctrl;
  TMR3_CNTR3=0;
  TMR3_COMP13=tmr3_slow_div;//count up to this, then toggle the output to form the slower clock (12 was used in testing)
  TMR3_CMPLD13=tmr3_slow_div;//required even without asking for preload
  TMR3_LOAD3=0;
  //TMR3_SCTRL3 and TMR3_CSCTRL3 are intentionally left untouched.
}
  
//One-time setup of QTimer3 and everything it is wired to:
//  - enables the QTimer3 clock gate and routes pad GPIO_AD_B1_03 (Teensy pin 15) to the QTimer3 timer 3 input
//  - enables XBAR1 and connects timer 3.0 -> FlexIO2 trigger and timer 3.2 -> timer 3.1 input
//  - programs the timer channels via configureTMR3Timers()
//  - installs qosedge() as the QTimer3 interrupt handler and enables the IRQ in the NVIC
//  - prepares the tmrWriter DMA channel that feeds TMR3_CMPLD10 from timer_buffer
//Neither the timers (TMR3_ENBL) nor the DMA channel are enabled here; that is left to the code that starts operation.
void setupQTimer(){
  //enable clock to timer
  CCM_CCGR6 |= CCM_CCGR6_QTIMER3(CCM_CCGR_ON);
  //Default timer frequency is 150 MHz
  //set up input pin alt function
  IOMUXC_QTIMER3_TIMER3_SELECT_INPUT=1; //map QTimer3.3 daisy chain to AD_B1_03
  IOMUXC_SW_MUX_CTL_PAD_GPIO_AD_B1_03=0x11;//low nibble 1 selects Teensy pin 15 -> QTimer3.Timer3, 0x10 forces the input path on
  IOMUXC_SW_PAD_CTL_PAD_GPIO_AD_B1_03=0x10000;//enable pad hysteresis, no pulldown, slow speed 50 MHz

  //optional test output of timer 3.2 on Teensy pin 14 (disabled):
  //pinMode(14, OUTPUT);//output timer 3.2 for testing
  //IOMUXC_QTIMER3_TIMER2_SELECT_INPUT=1; //map QTimer3.2 daisy chain to AD_B1_02
  //IOMUXC_SW_MUX_CTL_PAD_GPIO_AD_B1_02=0x1;//set to 1 for Teensy pin 14 -> QTimer3.Timer2
  
  //use XBAR to connect qtimer3.0 output to FlexIO2 trigger
  //XBAR signals used below:
  //QTIMER3_TIMER0 => XBAR1_IN32
  //XBAR1_OUT129 => FLEXIO2_TRIGGER_IN0
  //QTIMER3_TIMER2 => XBAR1_IN34
  //XBAR1_OUT97 => QTIMER3_TIMER3 input, with IOMUXC_GPR_GPR6[QTIMER3_TRM3_INPUT_SEL]
  //XBAR1_OUT95 => QTIMER3_TIMER1 input, with IOMUXC_GPR_GPR6[QTIMER3_TRM1_INPUT_SEL]
  
  CCM_CCGR2 |= CCM_CCGR2_XBAR1(3);//Enable XBAR1 clock
  delayMicroseconds(1);//wait for modules to initialize
  //each SEL register is updated in two steps: clear the upper byte (the lower byte is preserved), then OR in the new input number
  XBARA1_SEL64&=0x00FF;
  XBARA1_SEL64|=0x2000;//upper byte 0x20 = IN32: connect OUT129 to IN32 to enable FlexIO trigger with timer 3.0

  //alternative routing of timer 3.2 into the timer 3.3 input, used for testing only (disabled):
  //XBARA1_SEL48&=0x00FF;
  //XBARA1_SEL48|=0x2200;//connect OUT97 to IN34 for testing timer 3.3 input
  //IOMUXC_GPR_GPR6|=IOMUXC_GPR_GPR6_QTIMER3_TRM3_INPUT_SEL;//use XBAR input for timer 3.3

  XBARA1_SEL47&=0x00FF;
  XBARA1_SEL47|=0x2200;//upper byte 0x22 = IN34: connect OUT95 to IN34 to enable divided edge counting from timer 3.2 with timer 3.1 input pin
  IOMUXC_GPR_GPR6|=IOMUXC_GPR_GPR6_QTIMER3_TRM1_INPUT_SEL;//use XBAR input for timer 3.1
  
  configureTMR3Timers();//programs all four channels and leaves them disabled
  
  //optional test output of timer 3.0 on Teensy pin 19 (disabled):
  //pinMode(19, OUTPUT);//output timer3.0 pin 
  //digitalWrite(19, LOW);
  //IOMUXC_SW_MUX_CTL_PAD_GPIO_AD_B1_00=1;//set to 1 for Teensy pin 19 <- QTimer3.Timer0
  attachInterruptVector(IRQ_QTIMER3, qosedge);//qosedge() handles the optical edge captures
  NVIC_ENABLE_IRQ(IRQ_QTIMER3);
  //NVIC_SET_PRIORITY(IRQ_TIMER3, 32); //powers of 2, lower value = higher priority
  //TMR3_ENBL=1;//enable timer3.0 -- intentionally not done here

  //Set up DMA Channel:
  tmrWriter.begin();
  tmrWriter.sourceCircular(timer_buffer, timer_buffer_bytes);//this sets BITER and CITER to define number of minor loop counts in major loop, as well as NUMBYTES=2 bytes transferred per minor loop
  tmrWriter.destination(TMR3_CMPLD10);//write 16-bit value to constant address
  tmrWriter.triggerAtHardwareEvent(DMAMUX_SOURCE_QTIMER3_WRITE0_CMPLD1);//triggers minor loop to transfer 2 bytes
  tmrWriter.attachInterrupt(preload_timer);//preload_timer is called from the DMA interrupt
  tmrWriter.interruptAtCompletion();//interrupt called upon transferring entirety of timer_buffer set by CITER number of minor loop counts
  //tmrWriter.interruptAtHalf();//interrupt at half buffer, to extend the timing margin from 1 pixel to 16 pixels
  //when ready to start:
  //tmrWriter.enable(); //Note DMA channel should be enabled before enabling the hardware trigger, and disabled in reverse order
}

//Returns the current motor speed estimate, or NAN when no estimate is available.
//The result is (timer ticks per second)/(omega_est); omega_est is maintained by qosedge() as timer ticks
//per full circle, so the quotient is revolutions per second.
//NAN is returned while opt_state==0 (capture buffer still being preloaded) and also when omega_est is 0,
//which would otherwise make the division return +infinity instead of a usable speed.
//Not for high performance use: does float division and reads variables that the interrupt updates.
float get_motor_speed(){ //not for high performance use
  if(opt_state==0) { return NAN; }
  float ticks_per_circle=omega_est;//read once so the zero check and the division see the same value
  if(ticks_per_circle==0.0f) { return NAN; }//no valid interval measured yet; avoid dividing by zero
  float opt_timer_freq=(150000000.0f/16)/(tmr3_slow_div+1);//721153.85 = 150 MHz / (16*13 divider); previously 1171875=150MHz/128; ticks per second
  return opt_timer_freq/ticks_per_circle;
}

//QTimer3 interrupt handler (installed with attachInterruptVector(IRQ_QTIMER3, qosedge)).
//When the input edge flag (IEF) of channel 1 is set, the captured counter value TMR3_CAPT1 is stored in the
//circular buffer capture_timestamps[] at capture_ts_write, the flag is cleared and the opt_state state machine runs:
//   0  preload: wait until capture_ts_write has reached opt_edges_circle
//   1  spin up: require more than 10*opt_edges_circle consecutive edge intervals that are non-zero and short enough
//   2  stabilize: every opt_edges_circle/4 captures compare the last two circle durations, count those within 1%
//   3  find the index mark from the extrema of the differenced edge intervals over one circle
//   4  wait for the write position to line up with reference_index, then lower the interrupt rate (TMR3_COMP12)
//   5  collect more than opt_edges_thetas captures at the lower rate
//   9  fallback: restore the undivided edge rate and return to state 1
//  10  build the angle map thetas[]/thetas2[]
//  15  operational: update omega_est, thetas[], theta_est and the packed word ld_sync_var
//States 1-4 drop back to state 1, and states 5/10/15 to state 9, when an edge interval is zero or too long.
//After the switch the write index is advanced modulo opt_edges_nbuf and capture_total is incremented.
//opt1_el is reset on entry and sampled again at exit to track min/max time between and inside interrupts.
void qosedge(){
  //interrupt called after a (divided) optical switch transition has been captured into TMR3_CAPT1
  if((TMR3_SCTRL1&TMR_SCTRL_IEF)!=0){ //avoid double-triggering due to memory synchronization issues (or use asm volatile ("dsb"); instead)
    //optical edge captured
    //digitalWriteFast(2, HIGH);//for testing timing issues
    digitalToggleFast(1);//for testing timing issues
    uint32_t tel=opt1_el;//time elapsed since the previous interrupt reset opt1_el
    opt1_el=0;
    if(opt_state!=0){
      //track extremes of the interval between interrupts (skipped while still preloading)
      if(tel>opt1_el_max){
        opt1_el_max=tel;
      }
      if(tel<opt1_el_min){
        opt1_el_min=tel;
      }
    }
    capture_timestamps[capture_ts_write]=TMR3_CAPT1;//write to buffer; tick rate is that of the TMR3.1 counter clock (get_motor_speed() uses 150 MHz/16/(tmr3_slow_div+1))
    TMR3_SCTRL1&=(~TMR_SCTRL_IEF);//clear interrupt flag and enable next capture to occur
    //TMR3_SCTRL3&=(~TMR_SCTRL_IEF);//clear interrupt flag and enable next capture to occur
    static uint32_t spinupcons=0;//per-state progress counter, reset to 0 on every state change
    static int mx1=0, mx1n=0, mx2=0, mx2n=0, mn1=0, mn1n=0, mn2=0, mn2n=0, thrsh=0;//state 3: two largest/smallest values and their positions; NOTE(review): thrsh is written in state 3 but never read in this handler
    static float omega_iir=0;//state 15: IIR-filtered ticks per circle
    uint32_t l1, l2, ri, n;//buffer indices and positions within a circle
    uint16_t d1, d2;//timestamp differences; 16-bit so that counter roll-over cancels in the subtraction
    float df, dtheta;
    int d3;
    //NOTE(review): the index arithmetic of the form (capture_ts_write-k)%opt_edges_nbuf below relies on unsigned
    //wraparound and is only correct if capture_ts_write is unsigned and the modulus is a power of two -- confirm against the declarations
    switch(opt_state){
      case 0://preload buffer
        omega_est=0;
        if(capture_ts_write>=opt_edges_circle){
          opt_state=1;
          spinupcons=0;
        }
        break;
      case 1://spin up, set spinupcons=0 before entering
        l1=capture_ts_write;
        l2=(capture_ts_write-1)%opt_edges_nbuf;
        d1=capture_timestamps[l1]-capture_timestamps[l2];//ticks since previous edge
        omega_est=d1*opt_edges_circle;//ticks per circle extrapolated from one edge interval
        if(d1<(65535/opt_edges_circle) && d1>0){//minimum speed of ~18 Hz to allow for no wraparound subtraction over previous circle
          spinupcons++;
        }else{
          spinupcons=0;//any bad interval restarts the count
        }
        if(spinupcons>(10*opt_edges_circle)){
          opt_state=2;
          spinupcons=0;
        }
        break;
      case 2://stabilize speed, set spinupcons=0 before entering
        l1=capture_ts_write;
        l2=(capture_ts_write-1)%opt_edges_nbuf;
        d1=capture_timestamps[l1]-capture_timestamps[l2];
        omega_est=d1*opt_edges_circle;
        if(d1>(65535/opt_edges_circle) || d1==0){//minimum speed of ~18 Hz to allow for no wraparound subtraction over previous circle
          opt_state=1;
          spinupcons=0;
          break;
        }
        if((capture_ts_write%(opt_edges_circle/4))==0){//check four times per circle
          l1=capture_ts_write;
          l2=(capture_ts_write-opt_edges_circle)%opt_edges_nbuf;
          d1=capture_timestamps[l1]-capture_timestamps[l2];//duration of the most recent circle
          l1=(capture_ts_write-opt_edges_circle*2)%opt_edges_nbuf;
          d2=capture_timestamps[l2]-capture_timestamps[l1];//duration of the circle before that
          //d1 becomes the absolute difference of the two circle durations
          if(d1>d2){
            d1=d1-d2;
          }else{
            d1=d2-d1;
          }
          if((d1==0)||((d2/d1)>=100)){//stable within 1%
            spinupcons++;
          }
          if(spinupcons>40){
            opt_state=3;
            spinupcons=0;
            mx1=0; mx1n=0; mx2=0; mx2n=0; mn1=0; mn1n=0; mn2=0; mn2n=0;
          }
        }
        break;
      case 3://identify index position, set spinupcons=0, mx1=0; mx1n=0; mx2=0; mx2n=0; mn1=0; mn1n=0; mn2=0; mn2n=0; before entering
        l1=capture_ts_write;
        l2=(capture_ts_write-1)%opt_edges_nbuf;
        d1=capture_timestamps[l1]-capture_timestamps[l2];
        omega_est=d1*opt_edges_circle;
        if(d1>(65535/opt_edges_circle) || d1==0){//minimum speed of ~18 Hz to allow for no wraparound subtraction over previous circle
          opt_state=1;
          spinupcons=0;
          break;
        }
        if(spinupcons<opt_edges_circle){
          //Convolve timestamps with [-1,2,-1]
          //i.e. d3 = (previous edge interval) - (latest edge interval)
          l1=(capture_ts_write-2)%opt_edges_nbuf;
          d2=capture_timestamps[l2]-capture_timestamps[l1];
          d3=(int)d2;
          d3-=d1;
          n=capture_ts_write%opt_edges_circle;//position within the circle
          //Find 1st and 2nd maxima and minima
          if(d3>mx1){
            mx2=mx1;
            mx2n=mx1n;
            mx1=d3;
            mx1n=n;
          }else{
            if(d3>mx2){
              mx2=d3;
              mx2n=n;
            }
          }
          if(d3<mn1){
            mn2=mn1;
            mn2n=mn1n;
            mn1=d3;
            mn1n=n;
          }else{
            if(d3<mn2){
              mn2=d3;
              mn2n=n;
            }
          }
          spinupcons++;
        }else{
          //Divide 1st max by 2nd max, 1st min by 2nd min. Nominally ratio 2.2 for main peak, 1.5 for reverse peak.
          //Check that one of the ratios is above 1.9 and the other below 1.8
          //Whether max or min is the larger ratio depends on rotation direction and orientation of encoder ring
          //A zero mx2 or mn2 makes the float ratio inf/NaN, which fails the comparison below and is treated as a failed lock
          float rx=((float)mx1)/(float)mx2;
          float rn=((float)mn1)/(float)mn2;
          //NOTE(review): only the case "minimum is the main peak" is accepted here; the opposite orientation always takes the error path -- confirm this is intended
          if((rx<1.8f)&&(rn>1.9f)){//correct orientation established
            //The index position is the position of the main convolved peak
            reference_index=mn1n;//The index position is the position of the main convolved peak
            //Confirm that this repeats every 512 samples, then alignment is complete
            opt_state=4;
            spinupcons=0;
            thrsh=(int)(mn2*1.9f);//threshold for lock, note this definition is velocity-dependent so should not be used to verify lock later (a velocity-independent threshold can be defined for that)
          }else{
            opt_state=2;//go back and re-stabilize, counting the failed attempt
            err_lock++;
            spinupcons=0;
          }
        }
        break;
      case 4: //align to reference_index and increase interrupt timer divider to reduce number of interrupts
        l1=capture_ts_write;
        l2=(capture_ts_write-1)%opt_edges_nbuf;
        d1=capture_timestamps[l1]-capture_timestamps[l2];
        omega_est=d1*opt_edges_circle;
        if(d1>(65535/opt_edges_circle) || d1==0){//minimum speed of ~18 Hz to allow for no wraparound subtraction over previous circle
          opt_state=1;
          spinupcons=0;
          break;
        }
        n=(capture_ts_write-reference_index)%opt_edges_circle;//offset of this edge from the index position
        if(n==0){
          //slow down interrupt rate
          //TMR3_CNTR2=0;//not necessary
          TMR3_COMP12=opt_edges_every-1;//timer 3.2 now toggles its output every opt_edges_every counted edges
          TMR3_CMPLD12=opt_edges_every-1;//has to be set despite requesting no preload
          capture_timestamps[0]=capture_timestamps[capture_ts_write];//restart the buffer at the index edge
          capture_ts_write=0;//becomes 1 after the increment at the end of the handler
          opt_state=5;
          spinupcons=0;
        }
        break;
      case 5: //preload some values at lower rate
        l1=capture_ts_write;
        l2=(capture_ts_write-1)%opt_edges_nbuf;
        d1=capture_timestamps[l1]-capture_timestamps[l2];
        omega_est=d1*opt_edges_thetas;//from here on there are opt_edges_thetas captures per circle
        if(d1>(65535/opt_edges_thetas) || d1==0){//minimum speed of ~18 Hz to allow for no wraparound subtraction over previous circle
          opt_state=9;
          spinupcons=0;
          break;
        }
        spinupcons++;
        if(spinupcons>(opt_edges_thetas)){
          opt_state=10;
          spinupcons=0;
        }
        break;
      case 9: //return back to fast interrupts
        TMR3_COMP12=0;//compare value 0 restores the undivided edge rate
        TMR3_CMPLD12=0;//has to be set despite requesting no preload
        TMR3_CNTR2=0;
        opt_state=1;
        spinupcons=0;
        break;
      case 10: //build theta map, set spinupcons=0 and slow interrupt rate and align zero to reference_index before entering
        l1=capture_ts_write;
        l2=(capture_ts_write-1)%opt_edges_nbuf;
        d1=capture_timestamps[l1]-capture_timestamps[l2];
        omega_est=d1*opt_edges_thetas;
        if(d1>(65535/opt_edges_thetas) || d1==0){//minimum speed of ~18 Hz to allow for no wraparound subtraction over previous circle
          opt_state=9;
          spinupcons=0;
          break;
        }
        //n=(capture_ts_write-(opt_edges_thetas/2))%opt_edges_thetas;
        //if((n%opt_edges_every)==0){
          l2=(capture_ts_write-(opt_edges_thetas))%opt_edges_nbuf;
          d1=capture_timestamps[l1]-capture_timestamps[l2];//circle speed
          if(d1==0){//avoid divide by zero error which would break the theta algorithm
            opt_state=9;
            spinupcons=0;
            break;
          }
          //use the transition half a circle back, i.e. the one centred in the circle just measured
          l1=(capture_ts_write-(opt_edges_thetas/2))%opt_edges_nbuf;
          l2=(l1+1)%opt_edges_nbuf;
          d2=capture_timestamps[l2]-capture_timestamps[l1];//one transition speed
          df=((float)d2)/(float)d1;//fraction of angle between transitions
          ri=(l1)%opt_edges_thetas;//index relative to reference
          //ri>>=opt_edges_dec;
          //spinupcons phases: 0 = wait for ri==0, 1..opt_edges_thetas-1 = accumulate, then one more circle to normalize
          if(spinupcons>=opt_edges_thetas){
            if(spinupcons>=(2*opt_edges_thetas)){
              opt_state=15;
              spinupcons=0;
              omega_iir=(float)d1;//latest estimate of circle speed to seed IIR filter
            }else{
              thetas[ri]=thetas2[ri]*thetas2[opt_edges_thetas];//normalize: cumulative fraction times the stored reciprocal of the total
              spinupcons++;
            }
          }else{
            if(spinupcons==0){
              if(ri==0){
                thetas2[0]=0;
                thetas2[1]=df;
                spinupcons++;
              }//synchronize to first transition
            }else{
              thetas2[spinupcons+1]=thetas2[spinupcons]+df;//running sum of angle fractions
              spinupcons++;
              if(spinupcons==opt_edges_thetas){
                thetas2[opt_edges_thetas]=1.0f/thetas2[opt_edges_thetas];//normalize: last slot now holds the reciprocal of the total
              }
            }
          }
        //}
        break;
      case 15://operational, set slow update rate and spinupcons=0 and thetas[] and thetas2[] and reference_index and thrsh before entering
        l1=capture_ts_write;
        l2=(capture_ts_write-1)%opt_edges_nbuf;
        d1=capture_timestamps[l1]-capture_timestamps[l2];//theta segment speed
        if(d1>(65535/opt_edges_thetas) || d1==0){//minimum speed of ~18 Hz to allow for no wraparound subtraction over previous circle
          omega_est=d1*opt_edges_thetas;
          opt_state=9;
          err_slow++;
          digitalToggleFast(0);//for testing timing issues
          spinupcons=0;
          break;
        }
        n=(capture_ts_write)%opt_edges_thetas;//position within the circle
        //update omega estimator based on latest velocity
        dtheta=thetas[n]-thetas[(n-1)%opt_edges_thetas];//mapped angle fraction covered by the latest segment
        if(dtheta<0){
          dtheta+=1.0f;//segment crosses the zero position
        }
        omega_iir=omega_alpha*omega_iir+(1-omega_alpha)*((float)d1/dtheta);//ticks per circle, first-order IIR
        omega_est=omega_iir;
        //update thetas estimator
        l2=(capture_ts_write-(opt_edges_thetas))%opt_edges_nbuf;
        d1=capture_timestamps[capture_ts_write]-capture_timestamps[l2];//circle speed
        if(d1==0){//avoid divide by zero error which would break the theta algorithm
          opt_state=9;
          err_slow++;
          spinupcons=0;
          break;
        }
        l1=(capture_ts_write-(opt_edges_thetas/2))%opt_edges_nbuf;
        l2=(l1+1)%opt_edges_nbuf;
        d2=capture_timestamps[l2]-capture_timestamps[l1];//one transition speed
        df=((float)d2)/(float)d1;//fraction of angle between transitions
        ri=(l1)%opt_edges_thetas;//index relative to reference
        ri=ri+1;//ri now runs from 1 to opt_edges_thetas
        thetas[ri]=theta_alpha*thetas[ri]+(1-theta_alpha)*(thetas2[ri]*thetas2[opt_edges_thetas]);//blend in the normalized value from the previous pass
        thetas2[ri]=thetas2[ri-1]+df;//then overwrite the running sum for the current pass
        if(ri==opt_edges_thetas){
          thetas2[ri]=1.0f/thetas2[ri];//normalize
          thetas2[0]=0;//not necessary
        }
        theta_est=thetas[n];
        //generate atomic variable to be used in timer synchronization
        float theta_pix=theta_est;//+theta_pixel_shift+(theta_pixel_vshift*omega_est);//pixel value corresponding to current theta estimate
        //wrap into [0,1)
        if(theta_pix<0.0f){
          theta_pix+=1.0f;
        }else{
          if(theta_pix>=1.0f){
            theta_pix-=1.0f;
          }
        }
        theta_pix*=opt_edges_thetas;//reduced theta index
        float tp_lst=floor(theta_pix);//most recent reduced whole theta index
        float tp_df=theta_pix-tp_lst;//partial reduced theta offset between the two
        float cycper=omega_est*(1.0f/(opt_edges_thetas));//ticks per reduced theta index
        uint16_t timer_back=(uint16_t)round(tp_df*cycper);//ticks elapsed since that whole index
        uint16_t timer0=capture_timestamps[capture_ts_write]-timer_back;//expected timer value at previous whole theta
        uint16_t nval=(uint16_t)tp_lst;
        uint16_t cycpercir=(uint16_t)(cycper);//expected timer ticks per theta index
        //pack everything into one word written with a single assignment: timer0 in the upper 16 bits, shifted index and tick count ORed into the lower bits
        //NOTE(review): the shifted nval and cycpercir share the lower bits; presumably their ranges do not overlap -- confirm against opt_edges_cirs/opt_edges_dec
        ld_sync_var=((((uint32_t)timer0)<<16)|(nval<<((16-opt_edges_cirs)+opt_edges_dec))|(cycpercir));
        break;
    }
    
    capture_ts_write=(capture_ts_write+1)%opt_edges_nbuf;//advance circular write index
    
    capture_total++;

    tel=opt1_el;//time spent inside this handler since opt1_el was reset above
    if(tel>opt1_dur_max){
      opt1_dur_max=tel;
    }
    if(tel<opt1_dur_min){
      opt1_dur_min=tel;
    }

    //digitalWriteFast(2, LOW);//for debugging timing
  }

  asm volatile ("dsb");//memory barrier to ensure no re-entry (which would disrupt edge synchronization)
}
