False positives after the Parent host recovered

ling Zhang lzhang at lbl.gov
Thu May 20 20:04:27 CEST 2004


Hi,
 
I hope to get your input on a frustrating problem.  Right after a
"parent" host goes down and recovers, I receive a burst of notifications
indicating that downstream "children" have gone down & recovered, even
though that's not the case.  Although this behavior doesn't happen every
time a "parent" node goes down, my impression is that the odds are
greater than 30%. 
 
For example, suppose this is my network:
 
 
Nagios--------Bridge------Parent Switch--------Child switch1
                              |
                              |
                          Child Switch2--------Child switch3
 
The series of events goes like this:
 
1. Disconnect link between "Bridge" and "parent switch". 
2. Nagios reports and only reports "parent switch" down. (good) 
3. Re-connect link between "Bridge" and "parent switch". 
4. Nagios reports "parent switch" recovered. (very good)
5. Nagios reports "child switch1" and "child switch2" down right after
"parent switch" recovered. (what the?) 
6. Nagios reports "child switch1" and "child switch2" recovered shortly.
(????????)
 
 
Now, my Nagios host configuration for the testing network looks like
this:
 
; Host template for the bridge. It is not a real host: "register 0" below
; keeps it from being registered, so it only supplies defaults via "use".
define host{
        name                                            generic-Bridge
        notifications_enabled                           1
; Host notifications are enabled
        event_handler_enabled                           1
; Host event handler is enabled
        flap_detection_enabled                          0
; Flap detection is disabled (value is 0)
        process_perf_data                               1
; Process performance data
        retain_status_information                       1

        retain_nonstatus_information                    1

        check_command                                   check-host-alive
        max_check_attempts                              3
        notification_interval                           0
        notification_period                             24x7
        notification_options                            d,r
; Template only - do not register this definition as a host
        register                                        0

        }
 
 
; Host template shared by the switch hosts. It is not a real host:
; "register 0" below keeps it from being registered.
define host{
        name                                            generic-switch
; Template name, referenced by the switch hosts through "use"
        notifications_enabled                           1
; Host notifications are enabled
        event_handler_enabled                           1
; Host event handler is enabled
        flap_detection_enabled                          0
; Flap detection is disabled (value is 0)
        process_perf_data                               1
; Process performance data
        retain_status_information                       1

        retain_nonstatus_information                    1

        check_command                                   check-host-alive
        max_check_attempts                              3
        notification_interval                           0
        notification_period                             24x7
        notification_options                            d,r

; Template only - do not register this definition as a host
        register                                        0

        }
 
 
; Bridge: the first hop between the Nagios server and parent-switch.
; The "use" value must match the template's "name" (generic-Bridge) exactly;
; template names are compared case-sensitively, so "generic-bridge" would
; not resolve.
define host {
        use                     generic-Bridge
        host_name               Bridge
        address                 1.1.1.1
}
 
 
; parent-switch: declared as a child of Bridge through "parents".
define host {
        use                   generic-switch
        host_name             parent-switch
        address               1.1.1.10
        parents               Bridge
}   
 
; child-switch1: declared as a child of parent-switch through "parents".
define host {
        use                   generic-switch
        host_name             child-switch1
        address               1.1.1.11
        parents               parent-switch
}
 
 
; child-switch2: declared as a child of parent-switch through "parents".
define host {
        use                   generic-switch
        host_name             child-switch2
        address               1.1.1.12
        parents               parent-switch
}
 
 
; child-switch3: declared as a child of child-switch2 through "parents",
; so it sits two levels below parent-switch.
define host {
        use                   generic-switch
        host_name             child-switch3
        address               1.1.1.13
        parents               child-switch2
}
 
 
So, any idea on this?
 
Thanks.
 
Ling
 
 
 
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://www.monitoring-lists.org/archive/users/attachments/20040520/5ea83cc0/attachment.html>


More information about the Users mailing list