Робот для работы с виртуальным браузером Splash. Требует развернутый инстанс Splash (проще всего через docker). http://blindage.org/?p=9012
Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.
 
 

106 rindas
3.0 KiB

  1. '''
  2. This program checks urls for specific requests. Requires Splash server https://github.com/scrapinghub/splash
  3. Code by Vladimir Smagin, 2018
  4. Mail: 21h@blindage.org
  5. '''
  6. import requests
  7. import re
  8. import mailer
  9. from urllib.parse import quote
  10. import json
  11. import config
  12. import sys, os, signal
  13. import datetime
  class C:
      '''ANSI escape sequences used to decorate text printed to a terminal.

      Each attribute is a plain string meant to be concatenated in front of
      the text it styles; ENDC is the reset sequence that ends the styling.
      '''

      # Bright foreground colours.
      HEADER = '\033[95m'
      OKBLUE = '\033[94m'
      OKGREEN = '\033[92m'
      WARNING = '\033[93m'
      FAIL = '\033[91m'

      # Reset all attributes back to the terminal default.
      ENDC = '\033[0m'

      # Text attributes.
      BOLD = '\033[1m'
      UNDERLINE = '\033[4m'
  def sendmail(a, b):
      '''Do nothing and return None; both arguments are ignored.'''
      return None
  def sendmail2(subj, body):
      '''Send an HTML report e-mail to the address in config.mailaddr.

      subj -- value assigned to the message subject.
      body -- text placed after the fixed "Urgent report message" heading;
              it is inserted into the HTML as-is, without escaping.

      The message is handed to a mailer.Mailer bound to 'localhost'.
      A delivery failure is reported on stdout together with its cause and
      is not re-raised, so the caller keeps running. Returns None.
      '''
      message = mailer.Message(From='robot@blindage.org',
                               To=config.mailaddr)
      message.Subject = subj
      # Explicit "\n" keeps the literal independent of source indentation.
      message.Html = "<p>Urgent report message<br>\n%s" % body
      sender = mailer.Mailer('localhost')
      try:
          sender.send(message)
      except Exception as err:
          # Exception (not a bare except) lets Ctrl-C and sys.exit propagate,
          # while mail delivery stays best-effort.
          print(C.FAIL + "Can't send message to sysadmin!" + C.ENDC, err)
  # NOTE(review): this section was recovered from a listing that had lost its
  # indentation; the nesting below is reconstructed from the control flow.
  # Please confirm it against the author's copy.

  # Imported here so this section brings its own dependency into scope.
  import html

  # "-html" as the first command-line argument switches the report from
  # coloured console text to an HTML table.
  useHTMLTags = len(sys.argv) > 1 and sys.argv[1] == '-html'

  searchUrls = config.searchForRequests['urls']
  searchRegexps = config.searchForRequests['regexps']

  if useHTMLTags:
      print('<h1>Request checker report</h1>', 'Started', datetime.datetime.now())
      print("<table width='100%' border='1'>")
  else:
      print(datetime.datetime.now())

  for searchUrl in searchUrls:
      if useHTMLTags:
          print('<tr>')

      # Check the server state and clear its cache before every page.
      try:
          requests.get(config.searchServer + "/_ping")
          requests.post(config.searchServer + "/_gc")
      except requests.exceptions.RequestException:
          print(C.FAIL + "Splash server is down. It's time to panic!" + C.ENDC)
          sys.exit(1)

      if useHTMLTags:
          print('<td>', html.escape(searchUrl), '</td>')
      else:
          print(C.HEADER+"Checking URL", searchUrl+C.ENDC)

      # Ask the server for a HAR log of everything the page requested.
      # (The original author left an extra "&wait=30" option disabled here.)
      pageUrlRequest = config.searchServer + "/render.har?url=" + quote(searchUrl)
      try:
          r = requests.get(pageUrlRequest)
      except requests.exceptions.RequestException:
          print(C.WARNING + "Splash server connection error" + C.ENDC)
          sys.exit(1)

      # Collected cell content for the HTML report; stays empty in console mode.
      htmlString = ''
      if r.status_code == 200:
          answer = r.json()
          for requestUrl in answer['log']['entries']:
              url = requestUrl['response']['url']
              for searchRegexp in searchRegexps:
                  if re.search(searchRegexp, url):
                      if useHTMLTags:
                          # URLs and patterns come from fetched pages and
                          # config, so escape them before embedding in HTML.
                          htmlString += ('<strong>' + html.escape(searchRegexp)
                                         + '</strong><br>' + html.escape(url)
                                         + '<br>')
                      else:
                          print(C.OKGREEN + "Found request match", "\n\t", C.OKBLUE + url,
                                "\n\t", searchRegexp + C.ENDC)
      elif useHTMLTags:
          # Show the server's own error body in place of the match list.
          htmlString = '<font color=red>' + html.escape(r.text) + '</font>'
      else:
          print(C.WARNING + "Response code is not 200, got " + str(r.status_code) + C.ENDC)

      if useHTMLTags:
          print('<td>', htmlString, "</td></tr>")
      sys.stdout.flush()

  if useHTMLTags:
      print("</table>")
  # NOTE(review): assumed to run in both output modes -- confirm.
  print('Finished:', datetime.datetime.now())