-
Notifications
You must be signed in to change notification settings - Fork 209
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Delete statefile when tailer terminates due to an error #457
Changes from all commits
973e659
fb1032e
49b9d0b
74c836f
7060987
98f6f86
17d05cd
2233ae9
f28965e
3c9a4cd
fe79755
0cdd0e6
ce7b1b1
7cac8c1
ed18fba
2ded347
363b4bf
817cd32
e30cc0d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
{ | ||
"agent": { | ||
"run_as_user": "root" | ||
}, | ||
"logs": { | ||
"logs_collected": { | ||
"files": { | ||
"collect_list": [ | ||
{ | ||
"file_path": "/tmp/rotate_me.log*", | ||
"log_group_name": "{instance_id}", | ||
"log_stream_name": "{instance_id}Rotated", | ||
"timezone": "UTC" | ||
} | ||
] | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
"""
Log-rotation reproduction script, lifted from
https://github.com/aws/amazon-cloudwatch-agent/issues/447.

It is kept as a Python script because the issue could not be adequately
reproduced natively in Go, directly in the integration test code.
"""
import json
import logging
import time
from logging.handlers import TimedRotatingFileHandler

# Configure the root logger: emit INFO and above to a file that is rotated
# every 10 seconds.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(
    TimedRotatingFileHandler("/tmp/rotate_me.log", when="S", interval=10)
)

# First message, serialized as JSON.
root_logger.info(json.dumps({"Metric": "12345" * 10}))

# Wait longer than the rotation interval so the next write rotates the file.
time.sleep(15)

# Second message. It has the same byte length as the first one, which is the
# condition under which (per the original note) this message does not appear.
root_logger.info(json.dumps({"Metric": "09876" * 10}))

# Wait again so that the next write triggers another rotation.
time.sleep(15)

# Third message (per the original note, this one is partially written). It is
# passed as a dict, not a JSON string, so the logged text is the dict's repr
# (single-quoted).
root_logger.info({"Metric": "1234567890" * 10})
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ import ( | |
"context" | ||
"errors" | ||
"log" | ||
"strings" | ||
"testing" | ||
"time" | ||
|
||
|
@@ -104,6 +105,65 @@ func DeleteLogGroupAndStream(logGroupName, logStreamName string) { | |
} | ||
} | ||
|
||
// ValidateLogsInOrder takes a log group, log stream, a list of specific log lines and a timestamp. | ||
// It should query the given log stream for log events, and then confirm that the log lines that are | ||
// returned match the expected log lines. This also sanitizes the log lines from both the output and | ||
// the expected lines input to ensure that they don't diverge in JSON representation (" vs ') | ||
func ValidateLogsInOrder(t *testing.T, logGroup, logStream string, logLines []string, since time.Time) { | ||
log.Printf("Checking %s/%s since %s for %d expected logs", logGroup, logStream, since.UTC().Format(time.RFC3339), len(logLines)) | ||
cwlClient, clientContext, err := getCloudWatchLogsClient() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to return the error to begin with? We can still put
inside the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doing that means changing the function to accept a func getCloudwatchLogsClient(t *testing.T) {} I am indifferent on how we do it. The end result is the same. I just didn't see much reason to pass the testing struct down another level. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am fine in passing down a struct at a deeper level since it would reduce code and main reason was: Do we let each function handles custom respond to the error? Since it is not that's why I would prefer passing down; however, its fine for you to keep it since the end result is the same. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I disagree with the current approach for the CloudWatch metrics client creation in |
||
if err != nil { | ||
t.Fatalf("Error occurred while creating CloudWatch Logs SDK client: %v", err.Error()) | ||
} | ||
|
||
sinceMs := since.UnixNano() / 1e6 // convert to millisecond timestamp | ||
|
||
// https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_GetLogEvents.html | ||
// GetLogEvents can return an empty result while still having more log events on a subsequent page, | ||
// so rather than expecting all the events to show up in one GetLogEvents API call, we need to paginate. | ||
params := &cloudwatchlogs.GetLogEventsInput{ | ||
LogGroupName: aws.String(logGroup), | ||
LogStreamName: aws.String(logStream), | ||
StartTime: aws.Int64(sinceMs), | ||
StartFromHead: aws.Bool(true), // read from the beginning | ||
} | ||
|
||
foundLogs := make([]string, 0) | ||
var output *cloudwatchlogs.GetLogEventsOutput | ||
var nextToken *string | ||
|
||
for { | ||
if nextToken != nil { | ||
params.NextToken = nextToken | ||
} | ||
output, err = cwlClient.GetLogEvents(*clientContext, params) | ||
|
||
if err != nil { | ||
t.Fatalf("Error occurred while getting log events: %v", err.Error()) | ||
} | ||
|
||
for _, e := range output.Events { | ||
foundLogs = append(foundLogs, *e.Message) | ||
} | ||
|
||
if nextToken != nil && output.NextForwardToken != nil && *output.NextForwardToken == *nextToken { | ||
// From the docs: If you have reached the end of the stream, it returns the same token you passed in. | ||
log.Printf("Done paginating log events for %s/%s and found %d logs", logGroup, logStream, len(foundLogs)) | ||
break | ||
} | ||
|
||
nextToken = output.NextForwardToken | ||
} | ||
|
||
// Validate that each of the logs are found, in order and in full. | ||
assert.Len(t, foundLogs, len(logLines)) | ||
for i := 0; i < len(logLines); i++ { | ||
expected := strings.ReplaceAll(logLines[i], "'", "\"") | ||
actual := strings.ReplaceAll(foundLogs[i], "'", "\"") | ||
assert.Equal(t, expected, actual) | ||
} | ||
} | ||
|
||
// getCloudWatchLogsClient returns a singleton SDK client for interfacing with CloudWatch Logs | ||
func getCloudWatchLogsClient() (*cloudwatchlogs.Client, *context.Context, error) { | ||
if cwl == nil { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that the failure is better understood, do you think you could write Golang code instead of calling the python script?
I am fine with keeping the Python script, but I just wanted to check.
Or maybe it is better to keep the Python logger code, since it tests with something (the logger) that users will actually use, as opposed to custom Golang code that we write just to mimic the behavior.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm kind of on the fence about it. I can try to spend some time today to try doing it natively in Go again, but not at a higher priority than fixing this windows unit test failure. What's annoying is I manually tested on Windows and of course there's no issue then 🙄
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tried again briefly and didn't get it right. Pasting the function I wrote for deleting/recreating the log file in case there's something obvious I am missing. I just run this in a loop for the different log lines, and sleep a decent amount of time in between each write. I ran integration tests on my fork and it couldn't even find the log stream, and I don't think I want to invest much more time with this. I think framing it with "something that users will actually use" is good enough of a
~~excuse~~ justification to me to keep the Python script.