Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LoRaWan - CubeCell cannot join most time (error -6) #1013

Closed
michapr opened this issue Mar 13, 2024 · 33 comments
Closed

LoRaWan - CubeCell cannot join most time (error -6) #1013

michapr opened this issue Mar 13, 2024 · 33 comments
Assignees
Labels
bug Something isn't working more information needed Further investigation is needed, not necessarily by the author

Comments

@michapr
Copy link

michapr commented Mar 13, 2024

Using the LoRaWAN_End_Device_Reference.ino the CubeCell 2 can initialize the Device but return error -6 most times.
In TTN I see the join process without errors.

Tried to skip the error in sketch, but sending does not work.

Sometimes the process will go as expected, values will be transmitted and I can see it in TTN.
If connected with data transfer - I have seen that SF10BW125 was used - this is not normal, other devices (Heltec WirelessStickV3 lite) are using SF7BW125, as expected...

#include <RadioLib.h>

SX1262 radio = new Module(RADIOLIB_BUILTIN_MODULE);

LoRaWANNode node(&radio, &EU868);

void setup() {
  Serial.begin(115200);

  Serial.print(F("[SX12xx] Initializing ... "));
  int state = radio.begin();
  if(state == RADIOLIB_ERR_NONE) {
    Serial.println(F("init success!"));
  } else {
    Serial.print(F("init failed, code "));
    Serial.println(state);
    while(true);
  }

  uint64_t joinEUI = 0x...;  //CubeCell
  uint64_t devEUI = 0x...;  //CubeCell
  uint8_t nwkKey[] = { ... };
  uint8_t appKey[] = { ... };

  Serial.print(F("[LoRaWAN] Attempting over-the-air activation ... "));
  state = node.beginOTAA(joinEUI, devEUI, nwkKey, appKey);

  if(state >= RADIOLIB_ERR_NONE) {
    Serial.println(F("join success!"));
  } else {
    Serial.print(F("join failed, code "));
    Serial.println(state);
    while(true);
  }

  node.setADR(true);
  node.setDutyCycle(true, 1250);
  node.setDwellTime(true, 1000);
}

void loop() {
  int state = RADIOLIB_ERR_NONE;
  uint8_t battLevel = 146;
  node.setDeviceStatus(battLevel);
  uint32_t fcntUp = node.getFcntUp();

  Serial.print(F("[LoRaWAN] Sending uplink packet ... "));
  String strUp = "Hello World! #" + String(fcntUp);
  Serial.print("upstream:");Serial.println(strUp);  

  if(fcntUp % 64 == 0) {
    state = node.uplink(strUp, 10, true);
    node.sendMacCommandReq(RADIOLIB_LORAWAN_MAC_LINK_CHECK);
    node.sendMacCommandReq(RADIOLIB_LORAWAN_MAC_DEVICE_TIME);
  } else {
    state = node.uplink(strUp, 10);
  }
  if(state == RADIOLIB_ERR_NONE) {
    Serial.println(F("success!"));
  } else {
    Serial.print(F("failed, code "));
    Serial.println(state);
  }

  Serial.print(F("[LoRaWAN] Waiting for downlink ... "));
  String strDown;

  LoRaWANEvent_t event;
  state = node.downlink(strDown, &event);
  if(state == RADIOLIB_ERR_NONE) {
    Serial.println(F("success!"));

    Serial.print(F("[LoRaWAN] Data:\t\t"));
    if(strDown.length() > 0) {
      Serial.println(strDown);
    } else {
      Serial.println(F("<MAC commands only>"));
    }

    Serial.print(F("[LoRaWAN] RSSI:\t\t"));
    Serial.print(radio.getRSSI());
    Serial.println(F(" dBm"));

    Serial.print(F("[LoRaWAN] SNR:\t\t"));
    Serial.print(radio.getSNR());
    Serial.println(F(" dB"));

    Serial.print(F("[LoRaWAN] Frequency error:\t"));
    Serial.print(radio.getFrequencyError());
    Serial.println(F(" Hz"));

    Serial.println(F("[LoRaWAN] Event information:"));
    Serial.print(F("[LoRaWAN] Direction:\t"));
    if(event.dir == RADIOLIB_LORAWAN_CHANNEL_DIR_UPLINK) {
      Serial.println(F("uplink"));
    } else {
      Serial.println(F("downlink"));
    }
    Serial.print(F("[LoRaWAN] Confirmed:\t"));
    Serial.println(event.confirmed);
    Serial.print(F("[LoRaWAN] Confirming:\t"));
    Serial.println(event.confirming);
    Serial.print(F("[LoRaWAN] Datarate:\t"));
    Serial.print(event.datarate);
    Serial.print(F("[LoRaWAN] Frequency:\t"));
    Serial.print(event.freq, 3);
    Serial.println(F(" MHz"));
    Serial.print(F("[LoRaWAN] Output power:\t"));
    Serial.print(event.power);
    Serial.println(F(" dBm"));
    Serial.print(F("[LoRaWAN] Frame count:\t"));
    Serial.println(event.fcnt);
    Serial.print(F("[LoRaWAN] Port:\t\t"));
    Serial.println(event.port);
    
    Serial.print(radio.getFrequencyError());

    uint8_t margin = 0;
    uint8_t gwCnt = 0;
    if(node.getMacLinkCheckAns(&margin, &gwCnt)) {
      Serial.print(F("[LoRaWAN] LinkCheck margin:\t"));
      Serial.println(margin);
      Serial.print(F("[LoRaWAN] LinkCheck count:\t"));
      Serial.println(gwCnt);
    }

    uint32_t networkTime = 0;
    uint8_t fracSecond = 0;
    if(node.getMacDeviceTimeAns(&networkTime, &fracSecond, true)) {
      Serial.print(F("[LoRaWAN] DeviceTime Unix:\t"));
      Serial.println(networkTime);
      Serial.print(F("[LoRaWAN] LinkCheck second:\t1/"));
      Serial.println(fracSecond);
    }
  
  } else if(state == RADIOLIB_ERR_RX_TIMEOUT) {
    Serial.println(F("timeout!"));
  
  } else {
    Serial.print(F("failed, code "));
    Serial.println(state);
  }

  uint32_t minimumDelay = 60000;                  // try to send once every minute
  uint32_t interval = node.timeUntilUplink();     // calculate minimum duty cycle delay (per law!)
	uint32_t delayMs = max(interval, minimumDelay); // cannot send faster than duty cycle allows
  Serial.print("delay(ms): ");Serial.println(delayMs);
  delay(delayMs);
}

Attached 2 verbose debug logs - first with returned error -6 for join, second with a normal transfer without error.
So maybe you can see the difference and have an idea what happen here in most times.

CubeCell_join_failed.log
CubeCell_join_data_ok.log

Screenshots
attached from TTN a screen with join (but sketch returned -6) and a screen with datatransfer
CubeCell_TTN_failed
CubeCell_TTN_ok

Additional info :

  • Heltec CubeCell 2.2 / HTCC-AB01
  • Arduino IDE version 2.2.1
  • Library version 6.4.2
@HeadBoffin
Copy link
Collaborator

Where is the gateway in relation to your device? Too close and they can overpower each other.

We are just in the process of preparing another beta release - I'll get the CubeCell tested over the next couple of days but from previous tests I'm not aware of any particular issues.

@michapr
Copy link
Author

michapr commented Mar 13, 2024

I also thought about it and was going outside with CubeCell - but no changes.
Made it again to make a picture:

CubeCell_TTN_failed2

Many "failed joins", but sometimes it is ok - but with SF10BW125

Gateway is in same room (4meters about) - but it was never a problem before with other devices (WSLV3, Heltec_WiFi32, FeatherM0) and the CubeCell was working with the Heltec library in this case too (was starting all experiments with Heltec lib...)

You cannot see anything "special" in logs?

@StevenCellist
Copy link
Collaborator

You cannot see anything "special" in logs?

All I can see is SPI output that at best translates to "avada kedrava" to me - usually my eyes and brain suffer much worse from all that nonsense.

To me, this issue very much appears to be down to some details in the hardware or whatever RF environment you are working in. Version 6.4.3 is coming soon™, which will make troubleshooting RadioLib a bit easier, but apart from the suggestions from @HeadBoffin (mostly distance, probably check antennae), I doubt we can help you here.
You may find some more useful tips on the TTN forum (using search button), where the topic of not hearing a join-accept comes up at least once a week. Searching for error -6 won't help you there, but searching for join-accept or not joining or so will.

@HeadBoffin
Copy link
Collaborator

SPI output is automagically run through an Enigma machine and can only be decoded by @jgromes. Although I think LoRaTracker speaks it natively.

Your RSSI is OK at -75 but I'm unsure how you end up joining, sending a uplink and then immediately sending a join request - did the board reset.

Did we mention that the protocol is still in beta?

TL;DR: Let me do a sanity / smoke test of 6.4.2 in the morning so we know if you should be able to get it to work.

@michapr
Copy link
Author

michapr commented Mar 13, 2024

Your RSSI is OK at -75 but I'm unsure how you end up joining, sending a uplink and then immediately sending a join request - did the board reset.

Do you mean between the failed joins?
Yes, I every time tried a Hardware-Reset - until I have got a transmission

Did we mention that the protocol is still in beta?

Yes, of course, but you wrote "comments and issues are welcome" (like that... ;) ...)

@michapr
Copy link
Author

michapr commented Mar 14, 2024

To me, this issue very much appears to be down to some details in the hardware or whatever RF environment you are working in.

Not sure about this, because device was working before with the Heltec lib without any problems (with LoRaWAN version 1.0.2 of course). I only wanted to switch to RadioLib and have seen this issue (using RadioLib for other devices now too).
Only that's why have reported.

Thanks!

@HeadBoffin
Copy link
Collaborator

I commented out the session restore that comes after the join in your code and looked at the text logs locally. If you have logs again, please can you post them without SPI and inline as downloading random text files causes issues with indexing and virus checking etc. The starter version of code coming soon will help as it will make the test code more compact.

The setup of the data structures on the CubeCell is borked - something very strange is going on that I've not seen before and goes way beyond the debug output we have at present to pin point something obvious.

Both myself & @StevenCellist have RL (that's Real Life) pressing us for some results. And we are behind the curve on getting the latest issues, persistence and examples out the door. So this issue is very much on the todo list but it will be a few days before we can dismantle the session setup code to figure out the why.

@StevenCellist StevenCellist added bug Something isn't working more information needed Further investigation is needed, not necessarily by the author labels Mar 14, 2024
@michapr
Copy link
Author

michapr commented Mar 14, 2024

I very much appreciate your work and the high level of commitment (outside the RL) to this project.
And it is clear to me that such problems cannot usually be solved immediately and need some time if the cause is not yet known in detail.

If you have logs again, please can you post them without SPI and inline

Do you mean after the init procedure, beginning with state = node.beginOTAA(joinEUI, devEUI, nwkKey, appKey); ?
(regarding "without SPI")

Will be happy to do it.

@StevenCellist
Copy link
Collaborator

By "without SPI" we mean that you do not enable RADIOLIB_VERBOSE, but only RADIOLIB_DEBUG. Most of the issues related to LoRaWAN are bugs in the code, not problems with the radio. Therefore, we don't need to see the radio SPI communication.
In the next RadioLib update this will be much more clear, fortunately.

@michapr
Copy link
Author

michapr commented Mar 14, 2024

14:11:03.412 -> Copyright @2019-2020 Heltec Automation.All rights reserved.
14:11:03.675 -> [SX12xx] Initializing ... 
14:11:03.675 -> RadioLib Debug Info
14:11:03.675 -> Version:  6.4.2.0
14:11:03.675 -> Platform: CubeCell
14:11:03.675 -> Compiled: Mar 14 2024 14:00:35
14:11:03.675 -> 
14:11:03.675 -> Found SX126x: RADIOLIB_SX126X_REG_VERSION_STRING:
14:11:03.675 -> 0000320 53 58 31 32 36 31 20 56 32 44 20 32 44 30 32 00 | SX1261 V2D 2D02.
14:11:03.718 -> 
14:11:03.718 -> M	SX126x
14:11:03.750 -> init success!
14:11:03.750 -> [LoRaWAN] Attempting over-the-air activation ... 
14:11:03.750 -> Didn't restore session (checksum: 0, mode: 0)
14:11:03.750 -> First 16 bytes of NVM:
14:11:03.750 -> 0000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 | ................
14:11:03.750 -> Wiping EEPROM and starting a clean session
14:11:03.750 -> Setting up dynamic channels
14:11:03.750 -> Channel UL/DL 0 frequency =  MHz
14:11:03.750 -> Channel UL/DL 1 frequency =  MHz
14:11:03.750 -> Channel UL/DL 2 frequency =  MHz
14:11:03.783 -> exe MAC CID = 03, len = 4
14:11:03.783 -> ADR REQ: dataRate = 2, txPower = 0, chMask = 0x0000, chMaskCntl = 00, nbTrans = 0
14:11:03.783 -> DR 72: LORA (SF: 10, BW: , CR: 5)
14:11:03.783 ->  ADR ANS: status = 0x07
14:11:03.783 -> exe MAC CID = 04, len = 1
14:11:03.783 -> Max duty cycle: 1/2^7
14:11:03.783 -> exe MAC CID = 05, len = 4
14:11:03.783 -> Rx param REQ: rx1DrOffset = 0, rx2DataRate = 0, freq = 
14:11:03.783 -> Rx param ANS: status = 0x07
14:11:03.830 -> exe MAC CID = 08, len = 1
14:11:03.830 -> RX timing: delay = 1 sec
14:11:03.830 -> exe MAC CID = 09, len = 1
14:11:03.830 -> TX timing: dlDwell = 0, ulDwell = 0, maxEirp = 16 dBm
14:11:03.830 -> exe MAC CID = 0c, len = 1
14:11:03.830 -> ADR param setup: limitExp = 6, delayExp = 5
14:11:03.830 -> exe MAC CID = 0f, len = 1
14:11:03.830 -> Rejoin setup: maxTime = 15, maxCount = 10
14:11:03.924 -> 
14:11:03.924 -> Channel frequency UL =  MHz
14:11:03.924 -> DR 72: LORA (SF: 10, BW: , CR: 5)
14:11:03.924 -> Timeout in 556032 us
14:11:04.284 -> Join-request sent <-- Rx Delay start
14:11:04.329 -> 
14:11:04.329 -> Channel frequency DL =  MHz
14:11:04.329 -> DR 72: LORA (SF: 10, BW: , CR: 5)
14:11:04.329 -> Opening Rx1 window (226848 us timeout)... <-- Rx Delay end 
14:11:09.616 -> closing
14:11:09.616 -> DR 104: LORA (SF: 12, BW: , CR: 5)
14:11:09.616 -> �Opening Rx2 window (683552 us timeout)... <-- Rx Delay end 
14:11:11.030 -> closing
14:11:11.030 -> join failed, code -6

TTN image with verbose output:
grafik

BTW: session restore was not inside my code - copy/paste error only... (have removed in above code)

@HeadBoffin
Copy link
Collaborator

Thanks for this. There is no need to send any more information at this time - I've replicated the issue and will get to work on it - each time someone posts we have to choose, do we ignore them & seem rude or do we divert from the tasks at hand & give a reply.

Feel free to post if you find a fix, but in the meanwhile your other devices will serve you better.

@StevenCellist
Copy link
Collaborator

I will help you with a piece of the puzzle: the %f float specifier is known to be borked for CubeCell. Doesn't make life easier for us, but it is likely that the channel setup is not the problem here, even though the information for frequencies is blanked out. Will need to dig deeper...
I thought at some point @jgromes applied a fix for the float specifier, but I don't know if that was for CubeCell.

@jgromes
Copy link
Owner

jgromes commented Mar 14, 2024

Bit off-topic, but just to answer @StevenCellist - there's no fix for missing printf("%f") on Arduino platforms that don't support it. Instead you have to use RADIOLIB_DEBUG_<level>_PRINT_FLOAT macros to consistently print floats. Or (and this one is less annoying to use) don't print floats at all - mutliply the value by e.g. 1000 and then print the integer.

@michapr
Copy link
Author

michapr commented Mar 14, 2024

Also a bit "off-topic" - it was a good hint about the printf("%f") . Was not aware about this, in normal case it is available (for ESP32 etc...)
Have included the LibPrintf library in BuildOptUser.h and see now the values for freq and BW ;)

@StevenCellist
Copy link
Collaborator

@jgromes would that be a default option? No idea.. just a thought

@jgromes
Copy link
Owner

jgromes commented Mar 15, 2024

@StevenCellist I don't think it's a good idea to pull in 3rd party dependency (https://github.com/embeddedartistry/arduino-printf) if that's what you mean (and if that library I linked is the LibPrintf @michapr meant).

Let's not hijack this thread, if anyone has any more points about float printing, feel free to open a Discussion ;)

@StevenCellist
Copy link
Collaborator

Issue now also replicated by me after digging up an old Cubecell board.
The problem is the clock drift on these ASR650x chips. The following code...

for(int i = 0; i < 10; i++) {
	Serial.println(i);
	delay(1000);
}

... produces the following timestamps:

19:20:47.790 > 0
19:20:48.805 > 1
19:20:49.821 > 2
19:20:50.837 > 3
19:20:51.854 > 4
19:20:52.869 > 5
19:20:53.886 > 6
19:20:54.902 > 7
19:20:55.919 > 8
19:20:56.935 > 9

The average clock drift is about 16 milliseconds. For PlatformIO, adding the following build flag solves the problem:

build_flags = -D RADIOLIB_CLOCK_DRIFT_MS=16

Alternatively, you can set this in BuildOptUser.h:

#define RADIOLIB_CLOCK_DRIFT_MS 16

Please note that you should measure the clock drift of your device yourself: it may not be 16 ms, but rather 14 or 18 for example. You can do so easily by using something like the loop I showed, and adding timestamps to your terminal (integrated button in Arduino IDE serial monitor, or using the monitor_flags = time option in the platformio.ini file.

@StevenCellist
Copy link
Collaborator

StevenCellist commented Mar 20, 2024

While keeping it running for a while, I now see that the clock drift compensation may mess up something unintentionally:

19:26:39.876 > Sending uplink
19:26:39.947 > RLB_PRO: 
19:26:39.948 > RLB_PRO: PHY: Frequency UL = 867.700 MHz
19:26:39.958 > RLB_PRO: PHY: SF = 7, BW = 125.000 kHz, CR = 4/5
19:26:39.965 > RLB_PRO: Uplink (FcntUp = 5) decoded:
19:26:39.969 > RLB_PRO: 0000000 10 00 00 00 fc ff ff ff 0c 00 00 00 fc ff ff ff | ................
19:26:39.977 > RLB_PRO: 0000010 40 17 cc 0b 26 80 05 00 01 b4 65 03 00 01 00 94 | @...&.....e.....
19:26:40.056 > RLB_PRO: Uplink sent <-- Rx Delay start
19:26:40.059 > Error in sendReceive (-5)
19:26:40.063 > Sleeping
19:26:40.063 >
19:27:41.020 > Sending uplink
19:27:41.022 > Error in sendReceive (-1108)
19:27:41.024 > Sleeping

@jgromes could you have a look at the implementation of RADIOLIB_CLOCK_DRIFT_MS in ArduinoHAL.cpp / do you have an idea why this would result in the behaviour observed?

EDIT: the module source files (SX126x.cpp in this case) include the following:

  if(this->mod->hal->micros() - start > timeout) {
      finishTransmit();
      return(RADIOLIB_ERR_TX_TIMEOUT);
    }

start is compensated for clock drift and so is micros(), but timeout is not. Although it is set for 150% of the expected time-on-air.. so I would think that it would still work for a clock-drift of 16ms on 1000ms..?

@michapr
Copy link
Author

michapr commented Mar 22, 2024

(sorry for delay, was not available some days...)

The average clock drift is about 16 milliseconds.

For my CubeCell (not an "old" ;) ...) this are also 16ms...

Have added #define RADIOLIB_CLOCK_DRIFT_MS 16 - and now the join process is ok, even after reset etc.

But the receive data is not working, I think it is the same for you?

19:52:38.622 -> Uplink sent <-- Rx Delay start
19:52:38.622 -> success!
19:52:38.622 -> [LoRaWAN] Waiting for downlink ... 
19:52:38.667 -> Channel frequency DL = 867.700012 MHz
19:52:38.667 -> DR 72: LORA (SF: 10, BW: 125.000000, CR: 5)
19:52:43.631 -> Opening Rx1 window (226848 us timeout)... <-- Rx Delay end 
19:52:43.864 -> closing
19:52:43.943 -> Downlink message length: 15
19:52:43.943 -> downlinkMsg:
19:52:43.943 -> 0000000 49 01 00 00 00 01 a3 63 0b 26 00 00 00 00 00 0b | I......c.&......
19:52:43.943 -> 0000010 60 a3 63 0b 26 a3 00 00 06 15 45 60 92 bb 8c    | `.c.&.....E`...   
19:52:43.943 -> FOpts: A3
19:52:43.943 -> FOptsLen: 3
19:52:43.943 -> fcnt: 0, fcntPrev: 0, isAppDownlink: 0
19:52:43.989 -> fopts:
19:52:43.990 -> 0000000 0b 01 06 

@jgromes
Copy link
Owner

jgromes commented Mar 23, 2024

@StevenCellist regarding the timeout in Tx, I don't think that should be caused by the clock drift being set. The point of the RADIOLIB_CLOCK_DRIFT_MS macro is to scale the output from micros/millis from the "drifted" value to the "real" value, that's why start and the current timestamp is scaled, but timeout is not - it's already at the "real" value. In any case, 150% should be more than enough to cover for this in any case.

@michapr
Copy link
Author

michapr commented Mar 23, 2024

But the receive data is not working, I think it is the same for you?

It's depending on the node.setADR() as I have seen now. I had tried before with (false) because of the SF10BW125 ...
With node.setADR(true) I get following:

...
10:01:45.362 -> Uplink sent <-- Rx Delay start
10:01:45.362 -> success!
10:01:45.362 -> [LoRaWAN] Waiting for downlink ... 
10:01:45.362 -> Channel frequency DL = 868.099976 MHz
10:01:45.405 -> DR 72: LORA (SF: 10, BW: 125.000000, CR: 5)
10:01:50.363 -> Opening Rx1 window (226848 us timeout)... <-- Rx Delay end 
10:01:50.581 -> closing
10:01:50.581 -> DR 56: LORA (SF: 9, BW: 125.000000, CR: 5)
10:01:51.452 -> Opening Rx2 window (123424 us timeout)... <-- Rx Delay end 
10:01:51.591 -> closing
10:01:51.591 -> timeout!
10:01:51.591 -> delay(ms): 1793890

But I see in TTN SF10BW125.

@StevenCellist
Copy link
Collaborator

@michapr you seem to not exactly catch the details on LoRaWAN. The idea is that you almost never receive a downlink. Fewer downlinks is better. The "timeout" message is a bit misleading there, and is removed in the current version that should be released in a couple of days.

@jgromes I do agree that 150% should be plenty of headroom, but I also never saw it happen on any ESP32 and it occurs on the ASR after only a few uplinks already, every time.. I hope to check it out a bit more soon, but there must be some detail there..

@HeadBoffin
Copy link
Collaborator

But the receive data is not working, I think it is the same for you?

Your LoRaWAN experience can be enhanced by a huge percent, possibly as much as a gazillion, by ensuring you are up on the fundamentals. https://www.thethingsnetwork.org/docs/lorawan/ pretty much covers the super important details but don't try to watch the whole video in one go!

@michapr
Copy link
Author

michapr commented Mar 23, 2024

@HeadBoffin - sory for the wrong wording about "downlink" - I was thinking about the downlink procedure (code) after
"[LoRaWAN] Waiting for downlink ... "
I was not speaking about the receiving a downlink (I know that there must not be any) - and was not thinking about the "timeout" - this this is only a "status" and not an error, have seen this also in code.
It's probably not good to write in a very condensed manner here because not everyone can follow other people's thoughts.

Maybe it is better for me to leave this discussion here because you assume that I don't seem to match your level of experience

@StevenCellist
Copy link
Collaborator

An update on this: it turns out that the problem is quite fundamental to the CubeCell platform - more specifically, it appears to be due to the way that micros() overflows. There are some sketchy ways around this, but actually solving the problem seems to be a harder challenge. For the near future, the answer would be: "ASR650x / CubeCell is not well supported, please use something else". Maybe we / I / @jgromes can come up with a good solution at some point, but at least for me it's not worth prioritizing my time for it.

@StevenCellist
Copy link
Collaborator

StevenCellist commented Mar 25, 2024

Rollover/overflow isn't really the issue here apparently (which I expected), but there are so many weird things with the CubeCell platform.
See the next sketch for example:

#include <Arduino.h>

uint32_t last = 0;

void setup() {
    Serial.begin(115200);
    delay(2000);
    while(!Serial);
}

void loop() {
    uint32_t now = micros();
    Serial.println(now);
    if(now < last) {
        Serial.println("Stopped!");
        Serial.println(now);
        Serial.println(last);
        delay(900);
    }
    last = now;
}

now should never be smaller than last, yet I get the following output about every other 5 seconds (btu irregularly):

12:38:27.451 > Stopped!
12:38:27.456 > 92596866
12:38:27.458 > 92596999

Interestingly, the last value that is printed, always ends in 999 or sometimes 998 or 997. However, removing the line Serial.println(now); makes it such that the problem doesn't happen... why?????

millis() appears to be 'rocksolid' from the perspective of this sketch however; it runs fine and never results in "Stopped!".

@michapr
Copy link
Author

michapr commented Mar 25, 2024

@StevenCellist - yes, you are right, looks like a bug there. I have seen before that micros() function is using the internal system ticker but millis() the RTC clock.

http://community.heltec.cn/t/cubecell-micros-function/985/2
(BTW: it's also very interesting to print out the "now - last" loop cycle value ... ;) )

If you like I can open at Heltec an issue and follow up this there - so you don't need it to do.

@StevenCellist
Copy link
Collaborator

If you like I can open at Heltec an issue and follow up this there - so you don't need it to do.

Ehm, be my guest, but I don't expect anything. The platform is outdated, it's not maintained anymore given the 107 open issues, most without reactions.

@jgromes may implement a change from micros() to millis() in the transmit() function which is more likely to successfully handle this problem.

@jgromes
Copy link
Owner

jgromes commented Mar 25, 2024

@StevenCellist

However, removing the line Serial.println(now); makes it such that the problem doesn't happen... why?????

Without having any idea how CubeCell works, issues with timers that appear and disappear when interacting with another periphery (UART in this case) smells like badly configured interrupts to me. In any case out of scope for us.

@michapr

As @StevenCellist posted, I will switch timeout detection from micros to millis after we get 6.5.0 released. I'm afraid that's as much as we can do from our end.

@jgromes jgromes self-assigned this Mar 25, 2024
jgromes added a commit that referenced this issue Mar 29, 2024
@jgromes
Copy link
Owner

jgromes commented Mar 29, 2024

@michapr as promised, timeout detection is switched from micros to millis, which should resolve some of the weirdness of CubeCell. However, fully fixing it's issues with millis and micros is fairly out-of-scope for this library, so I suggest taking this to the CubeCell maintainers.

@michapr
Copy link
Author

michapr commented Mar 31, 2024

Thanks, CubeCell is working now (more or less) stable, no reconnects, no hanging.
ADR is not working for me, always SF10BW125 -maybe this is also microseconds() related?
(if using setDatarate(5) get SF7BW125 as expected...)

And - looks like the same problems with
sendMacCommandReq(RADIOLIB_LORAWAN_MAC_LINK_CHECK) and
sendMacCommandReq(RADIOLIB_LORAWAN_MAC_DEVICE_TIME)
as in #1040 - I had to disable it in the LoRaWAN_Reference.ino sketch.
But also maybe that this is here more CubeCell related... ;)

@michapr
Copy link
Author

michapr commented Apr 1, 2024

I'm afraid I have to correct myself...
Device is still hanging after some time.

13:02:44.322 -> [LoRaWAN] Next uplink in 180s
13:05:47.034 -> RLB_PRO: 
13:05:47.034 -> RLB_PRO: PHY: Frequency UL =  MHz
13:05:47.034 -> RLB_PRO: PHY: SF = 7, BW =  kHz, CR = 4/5
13:05:47.034 -> RLB_PRO: Uplink MAC payload (1 commands):
13:05:47.034 -> RLB_PRO: 0000000 0b 01                                           | ..                
13:05:47.082 -> RLB_PRO: Uplink (FcntUp = 64) decoded:
13:05:47.082 -> RLB_PRO: 0000000 6c 67 74 c1 90 65 5c 50 28 00 00 00 fc ff ff ff | lgt..e\P(.......
13:05:47.082 -> RLB_PRO: 0000010 40 5a c9 0b 26 02 40 00 9d 32 0a a7 c3 4c ff ff | @Z..&[email protected]..
13:05:47.082 -> RLB_PRO: 0000020 a2 22                                           | ."                
13:05:47.114 -> RLB_DBG: Timeout in 257 ms
13:05:47.146 -> RLB_PRO: Uplink sent <-- Rx Delay start
13:05:47.146 -> RLB_PRO: 
13:05:47.146 -> RLB_PRO: PHY: Frequency DL =  MHz
13:05:47.190 -> RLB_PRO: PHY: SF = 7, BW =  kHz, CR = 4/5
13:05:47.190 -> RLB_PRO: Opening Rx1 window (45856 us timeout)... <-- Rx Delay end 
13:05:52.216 -> RLB_PRO: Closing Rx1 window
13:05:52.216 -> RLB_PRO: Downlink (NFcntDown = 64) encoded:
13:05:52.216 -> RLB_PRO: 0000000 49 00 00 00 00 01 5a c9 0b 26 40 00 00 00 00 0b | I.....Z..&@.....
13:05:52.229 -> RLB_PRO: 0000010 60 5a c9 0b 26 83 40 00 ad e8 0b 7f 91 1d 0a    | `Z..&.@........   
13:05:52.229 -> <MAC commands only>
13:05:52.229 -> [LoRaWAN] RSSI:		-18.00 dBm
13:05:52.229 -> [LoRaWAN] SNR:		14.00 dB
13:05:52.229 -> [LoRaWAN] Frequency error:	1619.75 Hz
13:05:52.229 -> [LoRaWAN] Event information:
13:05:52.229 -> [LoRaWAN] Confirmed:	0
13:05:52.271 -> [LoRaWAN] Confirming:	1
13:05:52.271 -> [LoRaWAN] Datarate:	5
13:05:52.271 -> [LoRaWAN] Frequency:	868.100 MHz
13:05:52.271 -> [LoRaWAN] Output power:	10 dBm
13:05:52.271 -> [LoRaWAN] Frame count:	1
13:05:52.271 -> [LoRaWAN] Port:		0
13:05:52.271 -> [LoRaWAN] Next uplink in 180s
13:08:54.892 -> [LoRaWAN] Requesting LinkCheck and DeviceTime
13:08:54.960 -> RLB_PRO: 
13:08:54.960 -> RLB_PRO: PHY: Frequency UL =  MHz
13:08:54.960 -> RLB_PRO: PHY: SF = 7, BW =  kHz, CR = 4/5
13:08:54.996 -> RLB_PRO: Uplink (FcntUp = 65) decoded:
13:08:54.996 -> RLB_PRO: 0000000 10 00 00 00 fc ff ff ff 0c 00 00 00 fc ff ff ff | ................
13:08:54.996 -> RLB_PRO: 0000010 80 5a c9 0b 26 00 41 00 0a 4d 2a 1f 00 1d 0a 2b | .Z..&.A..M*....+
13:08:55.029 -> RLB_DBG: Timeout in 257 ms
13:08:55.061 -> RLB_PRO: Uplink sent <-- Rx Delay start
13:08:55.097 -> RLB_PRO: 
13:08:55.097 -> RLB_PRO: PHY: Frequency DL =  MHz
13:08:55.097 -> RLB_PRO: PHY: SF = 7, BW =  kHz, CR = 4/5
13:08:55.097 -> RLB_PRO: Opening Rx1 window (45856 us timeout)... <-- Rx Delay end 
13:09:00.124 -> RLB_PRO: Closing Rx1 window
13:09:00.124 -> RLB_PRO: Downlink (NFcntDown = 65) encoded:
13:09:00.124 -> RLB_PRO: 0000000 49 41 00 00 00 01 5a c9 0b 26 41 00 00 00 00 09 | IA....Z..&A.....
13:09:00.149 -> RLB_PRO: 0000010 60 5a c9 0b 26 a1 41 00 cd 8a 81 10 10          | `Z..&.A...... 

Nothing more happen until Reset.... Have seen multiple times.
(still using LoRaWAN_Reference.ino with setADR(false), setDatarate(5), setTxPower(10) in setup() ...)

@StevenCellist
Copy link
Collaborator

Well, it happens after the MAC requests, so it's still the same theme as in the Discussions. Will investigate, later.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working more information needed Further investigation is needed, not necessarily by the author
Projects
None yet
Development

No branches or pull requests

4 participants