Compare commits

..

2 Commits
v0.2.4 ... main

3 changed files with 50 additions and 27 deletions

View File

@ -40,3 +40,9 @@ Sending POST on /reload :
curl -XPOST http://my-nodegopher-host:8080/reload curl -XPOST http://my-nodegopher-host:8080/reload
{"message":"configuration successfully reloaded"} {"message":"configuration successfully reloaded"}
``` ```
Reloading a badly formatted configuration will produce an error and keep the old configuration running.
```
% curl -XPOST 127.1:8080/reload
{"error":"Unable to load new configuration, keeping old one. See logs."}
```

View File

@ -1,4 +1,4 @@
# Formatting metrics in main & secondarystat. Supported: "english", "french", "german", "ukrainian", "chinese", "arabic" # Formatting metrics in main & secondarystat. Supported: "english", "french", "german", "ukrainian", "chinese", "arabic". Default is english.
language: 'english' language: 'english'
# datasource describe a way to get prometheus metrics. # datasource describe a way to get prometheus metrics.
@ -7,51 +7,44 @@ language: 'english'
# - address: the address of prometheus. # - address: the address of prometheus.
# - query: prometheus query. Same as typed in prometheus graph page. # - query: prometheus query. Same as typed in prometheus graph page.
# - type: type of query. "query" will get instant value, "query_range" will get all samples for the grafana period. Result will be averaged. # - type: type of query. "query" will get instant value, "query_range" will get all samples for the grafana period. Result will be averaged.
# - timeout: query timeout in seconds. # - timeout: query timeout in seconds. default is 10.
datasources: datasources:
- name: prom_samples_per_sec - name: prom_samples_per_sec
type: query type: query
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
query: 'rate(prometheus_tsdb_head_samples_appended_total{type="float"}[10m])' query: 'rate(prometheus_tsdb_head_samples_appended_total{type="float"}[10m])'
timeout: 10 timeout: 15
- name: node_cpu_metric - name: node_cpu_metric
# Simple query, return an instant metric # Simple query, return an instant metric
type: query type: query
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
query: 'sum(rate(node_cpu_seconds_total{instance="router01.local.lan:9100",job="node",mode!~"idle"}[30s]))*100' query: 'sum(rate(node_cpu_seconds_total{instance="router01.local.lan:9100",job="node",mode!~"idle"}[30s]))*100'
timeout: 10
- name: node_cpu_metric_over_80 - name: node_cpu_metric_over_80
type: query type: query
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
# Return 1 if cpu rate > 80% # Return 1 if cpu rate > 80%
query: '(sum(rate(node_cpu_seconds_total{instance="router01.local.lan:9100",job="node",mode!~"idle"}[30s]))*100) > bool 80' query: '(sum(rate(node_cpu_seconds_total{instance="router01.local.lan:9100",job="node",mode!~"idle"}[30s]))*100) > bool 80'
timeout: 10
- name: router01_net_down_rate - name: router01_net_down_rate
# Range query. Return all metrics from a time range. Result will be averaged from these metrics. Time range will be provided by Grafana. # Range query. Return all metrics from a time range. Result will be averaged from these metrics. Time range will be provided by Grafana.
type: query_range type: query_range
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
query: 'rate(node_network_receive_bytes_total{device="igb0", instance="router01.local.lan:9100", job="node"}[30s])' query: 'rate(node_network_receive_bytes_total{device="igb0", instance="router01.local.lan:9100", job="node"}[30s])'
timeout: 10
- name: router01_net_up_rate - name: router01_net_up_rate
type: query_range type: query_range
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
query: 'rate(node_network_transmit_bytes_total{device="igb0", instance="router01.local.lan:9100", job="node"}[30s])' query: 'rate(node_network_transmit_bytes_total{device="igb0", instance="router01.local.lan:9100", job="node"}[30s])'
timeout: 10
- name: router01_lan_down_rate - name: router01_lan_down_rate
type: query_range type: query_range
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
query: 'rate(node_network_receive_bytes_total{device="ix3", instance="router01.local.lan:9100", job="node"}[30s])' query: 'rate(node_network_receive_bytes_total{device="ix3", instance="router01.local.lan:9100", job="node"}[30s])'
timeout: 10
- name: router01_lan_up_rate - name: router01_lan_up_rate
type: query_range type: query_range
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
query: 'rate(node_network_transmit_bytes_total{device="ix3", instance="router01.local.lan:9100", job="node"}[30s])' query: 'rate(node_network_transmit_bytes_total{device="ix3", instance="router01.local.lan:9100", job="node"}[30s])'
timeout: 10
- name: router01_net_down_rate_perten - name: router01_net_down_rate_perten
type: query type: query
address: 'http://prometheus.local.lan:9090' address: 'http://prometheus.local.lan:9090'
query: 'rate(node_network_receive_bytes_total{device="igb0", instance="router01.local.lan:9100", job="node"}[30s])/62500000*10' query: 'rate(node_network_receive_bytes_total{device="igb0", instance="router01.local.lan:9100", job="node"}[30s])/62500000*10'
timeout: 10
# graphs identifies context for a nodegraph. You can have many contexts, and your grafana query will mention this context name. # graphs identifies context for a nodegraph. You can have many contexts, and your grafana query will mention this context name.
# For this example named "internet", grafana URL will be : # For this example named "internet", grafana URL will be :

58
main.go
View File

@ -34,7 +34,7 @@ import (
) )
const ( const (
gVersion = "0.2.4" gVersion = "0.2.5"
// Default datasource timeout is 10 seconds // Default datasource timeout is 10 seconds
gDefaultDSTimeout = 10 gDefaultDSTimeout = 10
) )
@ -329,7 +329,7 @@ func getGraph(name string) (Graph, error) {
return Graph{}, fmt.Errorf("Graph not found: %s", name) return Graph{}, fmt.Errorf("Graph not found: %s", name)
} }
func initRoutes(r *gin.Engine) { func initRoutes(r *gin.Engine, confFile string) {
r.GET("/ping", func(c *gin.Context) { r.GET("/ping", func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{ c.JSON(http.StatusOK, gin.H{
"message": "pong", "message": "pong",
@ -338,7 +338,10 @@ func initRoutes(r *gin.Engine) {
// An endpoint to force read of configuration file // An endpoint to force read of configuration file
r.POST("/reload", func(c *gin.Context) { r.POST("/reload", func(c *gin.Context) {
reloadConfigFile() if err := reloadConfigFile(confFile); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusOK, gin.H{ c.JSON(http.StatusOK, gin.H{
"message": "configuration successfully reloaded", "message": "configuration successfully reloaded",
}) })
@ -469,16 +472,32 @@ func newEdgeClone(src *Edge) *Edge {
} }
} }
func reloadConfigFile() { // This function assume we already have a running configuration.
// First reread config file func reloadConfigFile(confFile string) error {
if err := viper.ReadInConfig(); err != nil { oldConfigRestored := false
if _, ok := err.(viper.ConfigFileNotFoundError); ok { // We need to keep this config, incase the new one is b0rken
log.Fatalf("config file not found") fname := fmt.Sprintf("/tmp/nodegopher.%d.yaml", os.Getpid())
os.Exit(1) if err := viper.WriteConfigAs(fname); err != nil {
log.Errorf("Unable to save current running config to %s, wont reload configuration.\n", fname)
return fmt.Errorf("Unable to save current configuration, configuration not reloaded. See logs.")
}
defer os.Remove(fname)
// Reread config file
if oldErr := viper.ReadInConfig(); oldErr != nil {
if _, ok := oldErr.(viper.ConfigFileNotFoundError); ok {
log.Errorf("config file not found")
} else { } else {
log.Fatalf("unknown error looking for config file: %v", err) log.Errorf("unknown error looking for config file: %v", oldErr)
os.Exit(1)
} }
// Restore old configuration and notify.
log.Debugf("Fallback on previous configuration.\n")
viper.SetConfigFile(fname)
if err := viper.ReadInConfig(); err != nil {
log.Fatalf("Unable to restore configuration, and new is invalid. fix it now.\n")
}
viper.SetConfigFile(confFile)
oldConfigRestored = true
} }
switch viper.Get("language").(string) { switch viper.Get("language").(string) {
@ -503,6 +522,7 @@ func reloadConfigFile() {
gCfgMutex.Lock() gCfgMutex.Lock()
defer gCfgMutex.Unlock() defer gCfgMutex.Unlock()
// We need to keep this config, in case the new one is broken
for _, g := range gGraphs { for _, g := range gGraphs {
g.Nodes = nil g.Nodes = nil
g.Edges = nil g.Edges = nil
@ -538,8 +558,8 @@ func reloadConfigFile() {
} }
if viper.Get("datasources") == nil { if viper.Get("datasources") == nil {
log.Printf("no datasources found, data will be static") log.Warningf("no datasources found, data will be static")
return return nil
} }
dss := viper.Get("datasources").([]interface{}) dss := viper.Get("datasources").([]interface{})
for _, d := range dss { for _, d := range dss {
@ -552,6 +572,10 @@ func reloadConfigFile() {
} }
gDataSources = append(gDataSources, ds) gDataSources = append(gDataSources, ds)
} }
if oldConfigRestored {
return fmt.Errorf("Unable to load new configuration, keeping old one. See logs.")
}
return nil
} }
func main() { func main() {
@ -604,7 +628,7 @@ func main() {
// FIXME: Watch config changes. Does not work on FreeBSD. TODO: Test with linux // FIXME: Watch config changes. Does not work on FreeBSD. TODO: Test with linux
viper.OnConfigChange(func(e fsnotify.Event) { viper.OnConfigChange(func(e fsnotify.Event) {
log.Printf("Config file changed, reloading data\n") log.Printf("Config file changed, reloading data\n")
reloadConfigFile() reloadConfigFile(confFile)
}) })
// Let's reload config on SIGHUP
@ -614,11 +638,11 @@ func main() {
for { for {
_ = <- sigs _ = <- sigs
log.Infof("SIGHUP received, reloading configuration\n") log.Infof("SIGHUP received, reloading configuration\n")
reloadConfigFile() reloadConfigFile(confFile)
} }
}() }()
reloadConfigFile() reloadConfigFile(confFile)
// Capture variable name. There should be only one variable. Space is tolerated before and after name. // Capture variable name. There should be only one variable. Space is tolerated before and after name.
gDSVarCompRegex = regexp.MustCompile(`^\{\{(?:\ )?([a-zA-Z0-9\-_]+)(?:\ )?\}\}$`) gDSVarCompRegex = regexp.MustCompile(`^\{\{(?:\ )?([a-zA-Z0-9\-_]+)(?:\ )?\}\}$`)
@ -629,6 +653,6 @@ func main() {
log.Printf("Starting NodeGopher v.%s\n", gVersion) log.Printf("Starting NodeGopher v.%s\n", gVersion)
r := gin.Default() r := gin.Default()
initRoutes(r) initRoutes(r, confFile)
r.Run(listen) r.Run(listen)
} }