bash-tools/checkurl at master · stuart-knock/bash-tools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env bash

#USAGE: checkurl 'http://url-to-check.org'
#
#EXAMPLE: checkurl 'https://xkcd.com/1319/'
#
#REQUIRES:
#  bashlogging
#  curl
#
#DESCRIPTION:
#  Check if a web page is accessible. Useful as a utility function for
#  identifying dead links. Exit status is 0 if the web page is accessible.
#  If the page is inaccessible the script prints a warning message before
#  exiting with status 1.
#
# Author: Stuart A. Knock
# Originally Written: 2015-11-28
# https://github.com/stuart-knock/bash-tools
#

#TODO: make it capable of handling multiple urls in an efficient way.
#TODO: probably won't call this at a command line very often, so,
#  maybe make it just a function checkurl() that's importable into
#  other scripts with a 'source urlchecker'

#A URL known to be reachable, probed first as a proxy for general web access.
#Defaults to Google; export CHECKURL_KNOWN_GOOD_URL to point it somewhere else
#(for example an intranet host) without editing this file.
knowngoodurl="${CHECKURL_KNOWN_GOOD_URL:-http://www.google.com}"

#Seconds curl is given to establish a connection (--connect-timeout) before a
#probe is considered to have failed. You might want to make this shorter if
#called as part of a script, particularly if it is called multiple times;
#export CHECKURL_TIMEOUT to do so. Default: 2
timetowait="${CHECKURL_TIMEOUT:-2}"

#Import some simple logging functionality. This script calls the debug,
#inform, warn and err functions, which are expected to come from this file.
source 'bashlogging'
#NOTE(review): these are assigned after sourcing, presumably so that they
#override any defaults set by bashlogging -- confirm against bashlogging.
LOGFILE='linkchecker.log'
LOGLEVEL=1 #Default to just showing warning and error messages.
#-----------------------------------------------------------------------------#

#Fail early, fail often: exactly one argument, the URL to check, is required.
if (( $# != 1 )); then
  err "  Usage: $0 'http://url-to-check.org'"
  exit 1
fi
url="$1"

#Use a known-to-be-good URL as a proxy for a functional interweb connection.
debug "Checking access to $knowngoodurl as a proxy for web access."
curl --connect-timeout "$timetowait" --output /dev/null --silent --head --fail "$knowngoodurl"
stateoftheweb=$?

#Without web access nothing can be concluded about the page itself, so report
#the broken connection and exit with status 1.
if [[ "$stateoftheweb" != "0" ]]; then
  err "The interwebs are broken..."
  exit 1
fi

#The interwebs are working, so check for the actual page.
debug "The web is alive... it's alive I tell you."
inform "Checking for accessibility of $url"

#Request just the HEAD of the webpage, throw away output, and exit with a
#failure state if the server answers with an error. The same connection
#timeout as the known-good probe is used so a dead host cannot stall us for
#curl's (much longer) default.
curl --connect-timeout "$timetowait" --output /dev/null --silent --head --fail "$url"
headaccess=$?
debug "headaccess=$headaccess"

#Some servers, apparently, deny HEAD requests, so when HEAD failed we double
#check by asking for only the first byte (range 0-0) with a normal request.
#The fallback is skipped, and nothing is logged for it, when HEAD succeeded.
pageaccess=$headaccess
if [[ "$headaccess" != "0" ]]; then
  curl --connect-timeout "$timetowait" --output /dev/null --silent --fail -r 0-0 "$url"
  firstbyteaccess=$?
  debug "firstbyteaccess=$firstbyteaccess"
  pageaccess=$firstbyteaccess
fi

#If either method was successful the page is accessible: exit status 0.
if [[ "$pageaccess" = "0" ]]; then
  inform "URL is ACCESIBLE: $url"
  exit 0
fi

#Otherwise the page is, at least currently, inaccessible: warn, exit status 1.
warn "URL is INACCESIBLE: $url"
exit 1