Робот для работы с виртуальным браузером Splash. Требует развернутый инстанс Splash (проще всего через docker). http://blindage.org/?p=9012
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

106 rader
3.0 KiB

  1. '''
  2. This program checks urls for specific requests. Requires Splash server https://github.com/scrapinghub/splash
  3. Code by Vladimir Smagin, 2018
  4. Mail: 21h@blindage.org
  5. '''
import datetime
import html
import json
import os
import re
import signal
import sys
from urllib.parse import quote

import mailer
import requests

import config
  14. class C:
  15. HEADER = '\033[95m'
  16. OKBLUE = '\033[94m'
  17. OKGREEN = '\033[92m'
  18. WARNING = '\033[93m'
  19. FAIL = '\033[91m'
  20. ENDC = '\033[0m'
  21. BOLD = '\033[1m'
  22. UNDERLINE = '\033[4m'
  23. def sendmail(a, b):
  24. pass
  25. def sendmail2(subj, body):
  26. message = mailer.Message(From='robot@blindage.org',
  27. To=config.mailaddr)
  28. message.Subject = subj
  29. message.Html = """<p>Urgent report message<br>
  30. %s""" % body
  31. sender = mailer.Mailer('localhost')
  32. try:
  33. sender.send(message)
  34. except:
  35. print(C.FAIL+"Can't send message to sysadmin!"+C.ENDC)
  36. if (len(sys.argv)>1) and (sys.argv[1] == '-html'):
  37. useHTMLTags = True
  38. else:
  39. useHTMLTags = False
  40. searchUrls = config.searchForRequests['urls']
  41. searchRegexps = config.searchForRequests['regexps']
  42. if useHTMLTags:
  43. print('<h1>Request checker report</h1>', 'Started', datetime.datetime.now())
  44. print("<table width='100%' border='1'>")
  45. else:
  46. print(datetime.datetime.now())
  47. for searchUrl in searchUrls:
  48. if useHTMLTags:
  49. print('<tr>')
  50. #очистить кеш, проверить состояние.
  51. try:
  52. r = requests.get(config.searchServer + "/_ping").text
  53. #print("\t",r)
  54. r = requests.post(config.searchServer + "/_gc").text
  55. #print("\t",r)
  56. except:
  57. print(C.FAIL + "Splash server is down. It's time to panic!" + C.ENDC)
  58. sys.exit(1)
  59. try:
  60. if useHTMLTags:
  61. print('<td>', searchUrl, '</td>')
  62. else:
  63. print(C.HEADER+"Checking URL", searchUrl+C.ENDC)
  64. pageUrlRequest = config.searchServer+"/render.har?url="+ quote(searchUrl)# + "&wait=30"
  65. r = requests.get(pageUrlRequest)
  66. except:
  67. print(C.WARNING + "Splash server connection error" + C.ENDC)
  68. sys.exit(1)
  69. if int(r.status_code) is 200:
  70. answer = r.json()
  71. htmlString = ''
  72. for requestUrl in answer['log']['entries']:
  73. url = requestUrl['response']['url']
  74. for searchRegexp in searchRegexps:
  75. if (re.search(searchRegexp, url)):
  76. if useHTMLTags:
  77. htmlString += '<strong>'+searchRegexp+'</strong><br>'+url+'<br>'
  78. else:
  79. print(C.OKGREEN + "Found request match", "\n\t", C.OKBLUE + url,
  80. "\n\t", searchRegexp + C.ENDC)
  81. else:
  82. if useHTMLTags:
  83. htmlString = '<font color=red>'+r.text+'</font>'
  84. else:
  85. print(C.WARNING + "Response code is not 200, got " + str(r.status_code) + C.ENDC)
  86. if useHTMLTags:
  87. print('<td>', htmlString,"</td></tr>")
  88. sys.stdout.flush()
  89. if useHTMLTags:
  90. print("</table>")
  91. print('Finished:', datetime.datetime.now())