Skip to content

Commit cfe16bf

Browse files
committed
[PDR-16012][feat]logkit发送接口延迟,内部queue长度指标开发
1 parent 01200fd commit cfe16bf

File tree

6 files changed

+123
-19
lines changed

6 files changed

+123
-19
lines changed

mgr/metric_runner.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ func (r *MetricRunner) Run() {
295295
dataCnt := 0
296296
datas := make([]Data, 0)
297297
metricTime := time.Now()
298-
tags[metric.Timestamp] = metricTime.Format(time.RFC3339Nano)
298+
tags[metric.Timestamp] = metricTime.UnixNano()/1e6
299299
for _, c := range r.collectors {
300300
metricName := c.Name()
301301
tmpdatas, err := c.Collect()
@@ -610,10 +610,14 @@ func (mr *MetricRunner) StatusRestore() {
610610
}
611611
sStatus, ok := s.(sender.StatsSender)
612612
if ok {
613-
sStatus.Restore(&StatsInfo{
613+
statsInfo:=&StatsInfo{
614614
Success: info[0],
615615
Errors: info[1],
616-
})
616+
}
617+
if len(info)>2{
618+
statsInfo.FtSendLag=info[2]
619+
}
620+
sStatus.Restore(statsInfo)
617621
}
618622
status, ext := mr.rs.SenderStats[name]
619623
if !ext {
@@ -635,7 +639,7 @@ func (mr *MetricRunner) StatusBackup() {
635639
status.ParserStats.Success,
636640
status.ParserStats.Errors,
637641
},
638-
SenderCnt: map[string][2]int64{},
642+
SenderCnt: map[string][]int64{},
639643
}
640644
for _, s := range mr.senders {
641645
name := s.Name()
@@ -646,9 +650,10 @@ func (mr *MetricRunner) StatusBackup() {
646650
status.SenderStats[name] = senderStats
647651
}
648652
if sta, exist := status.SenderStats[name]; exist {
649-
bStart.SenderCnt[name] = [2]int64{
653+
bStart.SenderCnt[name] = []int64{
650654
sta.Success,
651655
sta.Errors,
656+
sta.FtSendLag,
652657
}
653658
}
654659
}

mgr/runner.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,10 +1476,14 @@ func (r *LogExportRunner) StatusRestore() {
14761476
}
14771477
sStatus, ok := s.(sender.StatsSender)
14781478
if ok {
1479-
sStatus.Restore(&StatsInfo{
1479+
statsInfo:=&StatsInfo{
14801480
Success: info[0],
14811481
Errors: info[1],
1482-
})
1482+
}
1483+
if len(info)>2{
1484+
statsInfo.FtSendLag=info[2]
1485+
}
1486+
sStatus.Restore(statsInfo)
14831487
}
14841488
status, ext := r.rs.SenderStats[name]
14851489
if !ext {
@@ -1519,7 +1523,7 @@ func (r *LogExportRunner) StatusBackup() {
15191523
status.ParserStats.Errors,
15201524
},
15211525
TransCnt: map[string][2]int64{},
1522-
SenderCnt: map[string][2]int64{},
1526+
SenderCnt: map[string][]int64{},
15231527
}
15241528
r.historyMutex.Lock()
15251529
defer r.historyMutex.Unlock()
@@ -1535,9 +1539,10 @@ func (r *LogExportRunner) StatusBackup() {
15351539
for idx, t := range r.transformers {
15361540
name := formatTransformName(t.Type(), idx)
15371541
sta := t.Stats()
1538-
bStart.SenderCnt[name] = [2]int64{
1542+
bStart.SenderCnt[name] = []int64{
15391543
sta.Success,
15401544
sta.Errors,
1545+
sta.FtSendLag,
15411546
}
15421547
}
15431548

@@ -1563,9 +1568,10 @@ func (r *LogExportRunner) StatusBackup() {
15631568
status.SenderStats[name] = senderStats
15641569
}
15651570
if sta, exist := status.SenderStats[name]; exist {
1566-
bStart.SenderCnt[name] = [2]int64{
1571+
bStart.SenderCnt[name] = []int64{
15671572
sta.Success,
15681573
sta.Errors,
1574+
sta.FtSendLag,
15691575
}
15701576
}
15711577
}

reader/meta.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ const (
4848
type Statistic struct {
4949
ReaderCnt int64 `json:"reader_count"` // 读取总条数
5050
ParserCnt [2]int64 `json:"parser_connt"` // [解析成功, 解析失败]
51-
SenderCnt map[string][2]int64 `json:"sender_count"` // [发送成功, 发送失败]
51+
SenderCnt map[string][]int64 `json:"sender_count"` // [发送成功, 发送失败]
5252
TransCnt map[string][2]int64 `json:"transform_count"` // [解析成功, 解析失败]
5353
ReadErrors ErrorStatistic `json:"read_errors"`
5454
ParseErrors ErrorStatistic `json:"parse_errors"`

reader/meta_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ func TestMeta(t *testing.T) {
109109
stat := &Statistic{
110110
ReaderCnt: 6,
111111
ParserCnt: [2]int64{6, 8},
112-
SenderCnt: map[string][2]int64{
112+
SenderCnt: map[string][]int64{
113113
"aaa": {1, 2},
114114
"bbb": {5, 6},
115115
},

sender/fault_tolerant.go

Lines changed: 98 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ import (
44
"encoding/json"
55
"errors"
66
"fmt"
7+
"io/ioutil"
78
"math"
89
"os"
10+
"path/filepath"
911
"strconv"
1012
"strings"
1113
"sync"
@@ -36,6 +38,8 @@ const (
3638
KeyUnMarshalError = "Data unmarshal failed"
3739
// NumUnMarshalError
3840
NumUnMarshalError = 10
41+
// lag file
42+
LagFilename = "meta.lag"
3943
)
4044

4145
var _ SkipDeepCopySender = &FtSender{}
@@ -206,6 +210,9 @@ func newFtSender(innerSender Sender, runnerName string, opt *FtOption) (*FtSende
206210
isBlock: opt.isBlock,
207211
backoff: utils.NewBackoff(2, 1, 1*time.Second, 5*time.Minute),
208212
}
213+
ftSender.statsMutex.Lock()
214+
ftSender.stats.FtSendLag = ftSender.readLag()
215+
ftSender.statsMutex.Unlock()
209216

210217
if opt.innerSenderType == TypePandora {
211218
ftSender.pandoraKeyCache = make(map[string]KeyInfo)
@@ -273,9 +280,17 @@ func (ft *FtSender) RawSend(datas []string) error {
273280
} else {
274281
// se 中的 lasterror 和 senderror 都为空,需要使用 se.FtQueueLag
275282
se.AddSuccessNum(len(datas))
283+
ft.statsMutex.Lock()
284+
ft.stats.FtSendLag = ft.stats.FtSendLag + int64(len(datas))
285+
ft.statsMutex.Unlock()
276286
ft.backoff.Reset()
277287
}
278288
se.FtQueueLag = ft.BackupQueue.Depth() + ft.logQueue.Depth()
289+
if se.FtQueueLag == 0 {
290+
ft.statsMutex.Lock()
291+
ft.stats.FtSendLag = 0
292+
ft.statsMutex.Unlock()
293+
}
279294
}
280295
return se
281296
}
@@ -314,7 +329,7 @@ func (ft *FtSender) Send(datas []Data) error {
314329
}
315330

316331
if ft.isBlock {
317-
log.Error("Runner[%v] Sender[%v] try Send Datas err: %v", ft.runnerName, ft.innerSender.Name(), err)
332+
log.Errorf("Runner[%v] Sender[%v] try Send Datas err: %v", ft.runnerName, ft.innerSender.Name(), err)
318333
return se
319334
}
320335

@@ -354,9 +369,17 @@ func (ft *FtSender) Send(datas []Data) error {
354369
} else {
355370
// se 中的 lasterror 和 senderror 都为空,需要使用 se.FtQueueLag
356371
se.AddSuccessNum(len(datas))
372+
ft.statsMutex.Lock()
373+
ft.stats.FtSendLag = ft.stats.FtSendLag + int64(len(datas))
374+
ft.statsMutex.Unlock()
357375
ft.backoff.Reset()
358376
}
359377
se.FtQueueLag = ft.BackupQueue.Depth() + ft.logQueue.Depth()
378+
if se.FtQueueLag == 0 {
379+
ft.statsMutex.Lock()
380+
ft.stats.FtSendLag = 0
381+
ft.statsMutex.Unlock()
382+
}
360383
return se
361384
}
362385

@@ -395,6 +418,9 @@ func (ft *FtSender) Close() error {
395418
// persist queue's meta data
396419
ft.logQueue.Close()
397420
ft.BackupQueue.Close()
421+
ft.statsMutex.Lock()
422+
ft.writeLag(ft.stats.FtSendLag)
423+
ft.statsMutex.Unlock()
398424

399425
return ft.innerSender.Close()
400426
}
@@ -481,6 +507,9 @@ func (ft *FtSender) saveToFile(datas []Data) error {
481507
}
482508

483509
func (ft *FtSender) asyncSendLogFromQueue() {
510+
// if not sleep, queue lag may be cleared
511+
time.Sleep(time.Second * 10)
512+
484513
for i := 0; i < ft.procs; i++ {
485514
if ft.opt.sendRaw {
486515
readLinesChan := make(<-chan []string)
@@ -506,18 +535,32 @@ func (ft *FtSender) asyncSendLogFromQueue() {
506535
}
507536

508537
// trySend 从bytes反序列化数据后尝试发送数据
509-
func (ft *FtSender) trySendBytes(dat []byte, failSleep int, isRetry bool) (backDataContext []*datasContext, err error) {
538+
func (ft *FtSender) trySendBytes(dat []byte, failSleep int, isRetry bool, isFromQueue bool) (backDataContext []*datasContext, err error) {
510539
if ft.opt.sendRaw {
511540
datas, err := ft.unmarshalRaws(dat)
512541
if err != nil {
513542
return nil, errors.New(KeyUnMarshalError + ":" + err.Error())
514543
}
544+
ft.statsMutex.Lock()
545+
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
546+
if ft.stats.FtSendLag < 0 {
547+
ft.stats.FtSendLag = 0
548+
}
549+
ft.statsMutex.Unlock()
550+
515551
return ft.backOffSendRawFromQueue(datas, failSleep, isRetry)
516552
}
517553
datas, err := ft.unmarshalData(dat)
518554
if err != nil {
519555
return nil, errors.New(KeyUnMarshalError + ":" + err.Error())
556+
557+
}
558+
ft.statsMutex.Lock()
559+
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
560+
if ft.stats.FtSendLag < 0 {
561+
ft.stats.FtSendLag = 0
520562
}
563+
ft.statsMutex.Unlock()
521564

522565
return ft.backOffSendFromQueue(datas, failSleep, isRetry)
523566
}
@@ -566,6 +609,9 @@ func (ft *FtSender) trySendRaws(datas []string, failSleep int, isRetry bool) (ba
566609
log.Errorf("Runner[%v] Sender[%v] cannot write points back to queue %v: %v, discard datas %d", ft.runnerName, ft.innerSender.Name(), ft.BackupQueue.Name(), err, len(datas))
567610
return nil, nil
568611
}
612+
ft.statsMutex.Lock()
613+
ft.stats.FtSendLag += int64(len(v.Lines))
614+
ft.statsMutex.Unlock()
569615
}
570616

571617
time.Sleep(time.Second * time.Duration(math.Pow(2, float64(failSleep))))
@@ -620,6 +666,9 @@ func (ft *FtSender) trySendDatas(datas []Data, failSleep int, isRetry bool) (bac
620666
log.Errorf("Runner[%v] Sender[%v] cannot write points back to queue %v: %v, discard datas %d", ft.runnerName, ft.innerSender.Name(), ft.BackupQueue.Name(), err, len(datas))
621667
return nil, nil
622668
}
669+
ft.statsMutex.Lock()
670+
ft.stats.FtSendLag += int64(len(v.Datas))
671+
ft.statsMutex.Unlock()
623672
}
624673

625674
time.Sleep(time.Second * time.Duration(math.Pow(2, float64(failSleep))))
@@ -896,8 +945,14 @@ func (ft *FtSender) sendRawFromQueue(queueName string, readChan <-chan []byte, r
896945
} else {
897946
select {
898947
case bytes := <-readChan:
899-
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry)
948+
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry, true)
900949
case datas := <-readDatasChan:
950+
ft.statsMutex.Lock()
951+
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
952+
if ft.stats.FtSendLag < 0 {
953+
ft.stats.FtSendLag = 0
954+
}
955+
ft.statsMutex.Unlock()
901956
backDataContext, err = ft.backOffSendRawFromQueue(datas, numWaits, isRetry)
902957
case <-timer.C:
903958
continue
@@ -917,7 +972,7 @@ func (ft *FtSender) sendRawFromQueue(queueName string, readChan <-chan []byte, r
917972
unmarshalDataError++
918973
if unmarshalDataError > NumUnMarshalError {
919974
time.Sleep(time.Second)
920-
log.Errorf("Runner[%s] Sender[%s] sleep 1s due to unmarshal err", ft.runnerName, ft.innerSender.Name(), queueName, err)
975+
log.Errorf("Runner[%s] Sender[%s] queue[%s] sleep 1s due to unmarshal err %v", ft.runnerName, ft.innerSender.Name(), queueName, err)
921976
}
922977
} else {
923978
unmarshalDataError = 0
@@ -939,7 +994,6 @@ func (ft *FtSender) sendFromQueue(queueName string, readChan <-chan []byte, read
939994
defer timer.Stop()
940995
numWaits := 1
941996
unmarshalDataError := 0
942-
943997
var curDataContext, otherDataContext []*datasContext
944998
var curIdx int
945999
var backDataContext []*datasContext
@@ -955,8 +1009,14 @@ func (ft *FtSender) sendFromQueue(queueName string, readChan <-chan []byte, read
9551009
} else {
9561010
select {
9571011
case bytes := <-readChan:
958-
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry)
1012+
backDataContext, err = ft.trySendBytes(bytes, numWaits, isRetry, true)
9591013
case datas := <-readDatasChan:
1014+
ft.statsMutex.Lock()
1015+
ft.stats.FtSendLag = ft.stats.FtSendLag - int64(len(datas))
1016+
if ft.stats.FtSendLag < 0 {
1017+
ft.stats.FtSendLag = 0
1018+
}
1019+
ft.statsMutex.Unlock()
9601020
backDataContext, err = ft.backOffSendFromQueue(datas, numWaits, isRetry)
9611021
case <-timer.C:
9621022
continue
@@ -976,7 +1036,7 @@ func (ft *FtSender) sendFromQueue(queueName string, readChan <-chan []byte, read
9761036
unmarshalDataError++
9771037
if unmarshalDataError > NumUnMarshalError {
9781038
time.Sleep(time.Second)
979-
log.Errorf("Runner[%s] Sender[%s] sleep 1s due to unmarshal err", ft.runnerName, ft.innerSender.Name(), queueName, err)
1039+
log.Errorf("Runner[%s] Sender[%s] queue[%s] sleep 1s due to unmarshal err %v", ft.runnerName, ft.innerSender.Name(), queueName, err)
9801040
}
9811041
} else {
9821042
unmarshalDataError = 0
@@ -1225,3 +1285,34 @@ func (ft *FtSender) backOffReTrySendRaw(lines []string, isRetry bool) (res []*da
12251285
time.Sleep(backoff.Duration())
12261286
}
12271287
}
1288+
1289+
// readLag read lag from file
1290+
func (ft *FtSender) readLag() int64 {
1291+
path := filepath.Join(ft.opt.saveLogPath, LagFilename)
1292+
f, err := ioutil.ReadFile(path)
1293+
if err != nil {
1294+
log.Errorf("Runner[%v] Sender[%v] read file error : %v", ft.runnerName, ft.innerSender.Name(), err)
1295+
return 0
1296+
}
1297+
lag, err := strconv.ParseInt(string(f), 10, 64)
1298+
if err != nil {
1299+
log.Errorf("Runner[%v] Sender[%v] parse lag error : %v", ft.runnerName, ft.innerSender.Name(), err)
1300+
}
1301+
return lag
1302+
}
1303+
1304+
// writeLag write lag into file
1305+
func (ft *FtSender) writeLag(lag int64) error {
1306+
path := filepath.Join(ft.opt.saveLogPath, LagFilename)
1307+
file, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666)
1308+
defer func() {
1309+
file.Sync()
1310+
file.Close()
1311+
}()
1312+
if err != nil {
1313+
return err
1314+
}
1315+
lagStr := strconv.FormatInt(lag, 10)
1316+
_, err = file.WriteString(lagStr)
1317+
return err
1318+
}

utils/models/models.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ type LagInfo struct {
186186
Size int64 `json:"size"`
187187
SizeUnit string `json:"sizeunit"`
188188
Ftlags int64 `json:"ftlags"`
189+
FtSendLags int64 `json:"ft_send_lags"`
189190
Total int64 `json:"total"`
190191
}
191192

@@ -205,6 +206,7 @@ type StatsInfo struct {
205206
Trend string `json:"trend"`
206207
LastError string `json:"last_error"`
207208
FtQueueLag int64 `json:"-"`
209+
FtSendLag int64 `json:"ft_send_lag"`
208210
}
209211

210212
type ErrorStatistic struct {

0 commit comments

Comments
 (0)