diff --git a/chaquo/src/main/python/base/spider.py b/chaquo/src/main/python/base/spider.py
index 75e905a95..ec61e06b7 100644
--- a/chaquo/src/main/python/base/spider.py
+++ b/chaquo/src/main/python/base/spider.py
@@ -69,44 +69,26 @@ class Spider(metaclass=ABCMeta):
     def getDependence(self):
         return []
 
-    def regStr(self, src, reg, group=1):
-        m = re.search(reg, src)
-        src = ''
-        if m:
-            src = m.group(group)
-        return src
+    def loadModule(self, name, path):
+        return SourceFileLoader(name, path).load_module()
 
-    def str2json(self, str):
-        return json.loads(str)
+    def removeHtmlTags(self, src):
+        clean = re.compile('<.*?>')
+        return re.sub(clean, '', src)
 
     def cleanText(self, src):
         clean = re.sub('[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]', '', src)
         return clean
 
-    def fetch(self, url, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
-        rsp = requests.get(url, cookies=cookies, headers=headers, timeout=timeout, verify=verify, stream=stream, allow_redirects=allow_redirects)
-        rsp.encoding = 'utf-8'
-        return rsp
-
-    def post(self, url, data, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
-        rsp = requests.post(url, data=data, cookies=cookies, headers=headers, timeout=timeout, verify=verify, stream=stream, allow_redirects=allow_redirects)
+    def fetch(self, url, params=None, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
+        rsp = requests.get(url, params=params, cookies=cookies, headers=headers, timeout=timeout, verify=verify, stream=stream, allow_redirects=allow_redirects)
         rsp.encoding = 'utf-8'
         return rsp
 
-    def postJson(self, url, json, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
-        rsp = requests.post(url, json=json, cookies=cookies, headers=headers, timeout=timeout, verify=verify, stream=stream, allow_redirects=allow_redirects)
+    def post(self, url, params=None, data=None, json=None, cookies=None, headers=None, timeout=5, verify=True, stream=False, allow_redirects = True):
+        rsp = requests.post(url, params=params, data=data, json=json, cookies=cookies, headers=headers, timeout=timeout, verify=verify, stream=stream, allow_redirects=allow_redirects)
         rsp.encoding = 'utf-8'
         return rsp
 
     def html(self, content):
         return etree.HTML(content)
-
-    def xpText(self, root, expr):
-        ele = root.xpath(expr)
-        if len(ele) == 0:
-            return ''
-        else:
-            return ele[0]
-
-    def loadModule(self, name, path):
-        return SourceFileLoader(name, path).load_module()