Update spider.py

pull/586/head
FongMi 2 years ago
parent d621e858a4
commit 0828653324
  1. 36
      chaquo/src/main/python/base/spider.py

@ -69,44 +69,26 @@ class Spider(metaclass=ABCMeta):
def getDependence(self):
    """Return an empty list.

    This implementation declares no dependencies; a fresh list is
    created on every call. Presumably subclasses override this to
    name what they depend on -- confirm against callers.
    """
    dependencies = []
    return dependencies
def regStr(self, src, reg, group=1):
    """Return the text captured by ``group`` for the first match of ``reg`` in ``src``.

    Args:
        src: Text to search.
        reg: Regular expression handed to ``re.search``.
        group: Capture group to extract (defaults to 1).

    Returns:
        The captured text, or ``''`` when the pattern does not match or
        the requested group did not take part in the match.

    Raises:
        IndexError: If ``group`` does not exist in the pattern.
    """
    match = re.search(reg, src)
    if match is None:
        return ''
    captured = match.group(group)
    # An optional group that did not participate yields None; normalise
    # it to '' so the "no result" value is the same in every case.
    return '' if captured is None else captured
def loadModule(self, name, path):
    """Load the Python source file at ``path`` as a module named ``name``.

    Uses the spec-based import API rather than the deprecated
    ``Loader.load_module()``.

    Args:
        name: Module name; the module is registered in ``sys.modules``
            under this key.
        path: Filesystem path of the source file (any extension).

    Returns:
        The executed module object.

    Raises:
        Any exception raised while reading or executing the file is
        propagated to the caller.
    """
    import importlib.util
    import sys

    # Pass the loader explicitly so files without a ".py" suffix still load.
    spec = importlib.util.spec_from_file_location(
        name, path, loader=SourceFileLoader(name, path)
    )
    module = importlib.util.module_from_spec(spec)

    # Register before executing (as load_module() did) so code in the file
    # that imports itself by name sees the module being initialised.
    previous = sys.modules.get(name)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        if previous is None:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return module
def str2json(self, str):
    """Parse a JSON document and return the decoded Python object.

    Args:
        str: JSON text handed to ``json.loads``. (The parameter name
            shadows the builtin inside this method only.)

    Returns:
        Whatever ``json.loads`` produces for the input.
    """
    parsed = json.loads(str)
    return parsed
def removeHtmlTags(self, src):
    """Strip everything that looks like an HTML tag from ``src``.

    Args:
        src: Text that may contain ``<...>`` markup.

    Returns:
        ``src`` with every ``<`` ... ``>`` span removed. Text between
        tags is kept unchanged.
    """
    # DOTALL lets "." cross line breaks, so a tag whose attributes are
    # wrapped over several lines is removed as well.
    tag_pattern = re.compile('<.*?>', re.DOTALL)
    return tag_pattern.sub('', src)
def cleanText(self, src):
    """Remove characters in four emoji code-point ranges from ``src``.

    The ranges dropped are U+1F600-U+1F64F, U+1F300-U+1F5FF,
    U+1F680-U+1F6FF and U+1F1E0-U+1F1FF; all other characters are kept.

    Args:
        src: Text to filter.

    Returns:
        The filtered text.
    """
    emoji_class = '[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]'
    return re.sub(emoji_class, '', src)
def fetch(self, url, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
    """Send an HTTP GET request and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        cookies, headers, timeout, verify, stream, allow_redirects:
            Forwarded unchanged to ``requests.get``.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``'utf-8'``.
    """
    options = dict(
        cookies=cookies,
        headers=headers,
        timeout=timeout,
        verify=verify,
        stream=stream,
        allow_redirects=allow_redirects,
    )
    response = requests.get(url, **options)
    # Override whatever charset the server declared.
    response.encoding = 'utf-8'
    return response
def post(self, url, data, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
    """Send an HTTP POST request with a form/body payload.

    Args:
        url: Address to request.
        data: Request body handed to ``requests.post`` as ``data``.
        cookies, headers, timeout, verify, stream, allow_redirects:
            Forwarded unchanged to ``requests.post``.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``'utf-8'``.
    """
    rsp = requests.post(url, data=data, cookies=cookies, headers=headers, timeout=timeout, verify=verify, stream=stream, allow_redirects=allow_redirects)
    # Force UTF-8 and hand the response back, matching fetch(); without
    # the return the caller would receive None.
    rsp.encoding = 'utf-8'
    return rsp
def fetch(self, url, params=None, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
    """Send an HTTP GET request and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        params: Query parameters handed to ``requests.get`` as ``params``.
        cookies, headers, timeout, verify, stream, allow_redirects:
            Forwarded unchanged to ``requests.get``.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``'utf-8'``.
    """
    options = dict(
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
        verify=verify,
        stream=stream,
        allow_redirects=allow_redirects,
    )
    response = requests.get(url, **options)
    # Override whatever charset the server declared.
    response.encoding = 'utf-8'
    return response
def postJson(self, url, json, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
    """Send an HTTP POST request with a JSON payload.

    Args:
        url: Address to request.
        json: Object handed to ``requests.post`` as ``json``. (The
            parameter name shadows the ``json`` module inside this
            method only.)
        cookies, headers, timeout, verify, stream, allow_redirects:
            Forwarded unchanged to ``requests.post``.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``'utf-8'``.
    """
    rsp = requests.post(url, json=json, cookies=cookies, headers=headers, timeout=timeout, verify=verify, stream=stream, allow_redirects=allow_redirects)
    # Force UTF-8 and hand the response back, matching fetch(); without
    # the return the caller would receive None.
    rsp.encoding = 'utf-8'
    return rsp
def post(self, url, params=None, data=None, json=None, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
    """Send an HTTP POST request and return the response decoded as UTF-8.

    Args:
        url: Address to request.
        params: Query parameters handed to ``requests.post`` as ``params``.
        data: Request body handed to ``requests.post`` as ``data``.
        json: Object handed to ``requests.post`` as ``json``. (The
            parameter name shadows the ``json`` module inside this
            method only.)
        cookies, headers, timeout, verify, stream, allow_redirects:
            Forwarded unchanged to ``requests.post``.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``'utf-8'``.
    """
    options = dict(
        params=params,
        data=data,
        json=json,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
        verify=verify,
        stream=stream,
        allow_redirects=allow_redirects,
    )
    response = requests.post(url, **options)
    # Override whatever charset the server declared.
    response.encoding = 'utf-8'
    return response
def html(self, content):
    """Parse ``content`` with ``etree.HTML`` and return the result.

    Args:
        content: HTML markup to parse.

    Returns:
        Whatever ``etree.HTML`` returns for the input.
    """
    parsed = etree.HTML(content)
    return parsed
def xpText(self, root, expr):
    """Return the first result of evaluating ``expr`` on ``root``.

    Args:
        root: Object exposing an ``xpath(expr)`` method.
        expr: XPath expression passed to ``root.xpath``.

    Returns:
        The first item of the XPath result, or ``''`` when the result
        is empty.
    """
    matches = root.xpath(expr)
    # Guard clause: a non-empty result yields its first item.
    if len(matches) != 0:
        return matches[0]
    return ''
def loadModule(self, name, path):
    """Load the Python source file at ``path`` as a module named ``name``.

    Uses the spec-based import API rather than the deprecated
    ``Loader.load_module()``.

    Args:
        name: Module name; the module is registered in ``sys.modules``
            under this key.
        path: Filesystem path of the source file (any extension).

    Returns:
        The executed module object.

    Raises:
        Any exception raised while reading or executing the file is
        propagated to the caller.
    """
    import importlib.util
    import sys

    # Pass the loader explicitly so files without a ".py" suffix still load.
    spec = importlib.util.spec_from_file_location(
        name, path, loader=SourceFileLoader(name, path)
    )
    module = importlib.util.module_from_spec(spec)

    # Register before executing (as load_module() did) so code in the file
    # that imports itself by name sees the module being initialised.
    previous = sys.modules.get(name)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        if previous is None:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return module

Loading…
Cancel
Save