diff --git a/app/build.gradle b/app/build.gradle
index f25524c..f76533d 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -42,9 +42,9 @@ android {
 dependencies {
     implementation 'com.squareup.okhttp3:okhttp:' + okhttpVersion
     implementation 'com.github.thegrizzlylabs:sardine-android:0.9'
-    implementation 'wang.harlon.quickjs:wrapper-android:2.4.0'
-    implementation 'com.google.code.gson:gson:2.11.0'
+    implementation 'wang.harlon.quickjs:wrapper-android:2.4.3'
+    implementation 'com.google.code.gson:gson:2.12.1'
     implementation 'cn.wanghaomiao:JsoupXpath:2.5.1'
     implementation 'com.orhanobut:logger:2.2.0'
-    implementation 'org.jsoup:jsoup:1.15.3'
+    implementation 'org.jsoup:jsoup:1.17.2'
 }
\ No newline at end of file
diff --git a/app/proguard-rules.pro b/app/proguard-rules.pro
index ddd270a..3131646 100644
--- a/app/proguard-rules.pro
+++ b/app/proguard-rules.pro
@@ -15,6 +15,7 @@
 # Spider
 -keep class com.github.catvod.crawler.* { *; }
 -keep class com.github.catvod.spider.* { public <methods>; }
+-keep class com.github.catvod.js.Function { *; }
 
 # OkHttp
 -dontwarn okhttp3.**
diff --git a/app/src/main/java/com/github/catvod/js/Function.java b/app/src/main/java/com/github/catvod/js/Function.java
new file mode 100644
index 0000000..ffb9e33
--- /dev/null
+++ b/app/src/main/java/com/github/catvod/js/Function.java
@@ -0,0 +1,62 @@
+package com.github.catvod.js;
+
+import com.github.catvod.js.utils.Parser;
+import com.github.catvod.js.utils.JSUtil;
+import com.whl.quickjs.wrapper.JSArray;
+import com.whl.quickjs.wrapper.JSMethod;
+import com.whl.quickjs.wrapper.QuickJSContext;
+
+import java.lang.reflect.Method;
+
+/**
+ * Exposes the {@link Parser} helpers to a QuickJS context as global functions.
+ */
+public class Function {
+
+    private final QuickJSContext ctx;
+    private final Parser parser;
+
+    public Function(QuickJSContext ctx) {
+        this.parser = new Parser();
+        this.ctx = ctx;
+        setProperty();
+    }
+
+    /**
+     * Registers every public method annotated with {@link JSMethod} on the global object,
+     * using the Java method name as the property name.
+     */
+    private void setProperty() {
+        for (Method method : getClass().getMethods()) {
+            if (!method.isAnnotationPresent(JSMethod.class)) continue;
+            ctx.getGlobalObject().setProperty(method.getName(), args -> {
+                try {
+                    return method.invoke(this, args);
+                } catch (Exception ignored) {
+                    // Best effort: a call that fails is answered with null.
+                    return null;
+                }
+            });
+        }
+    }
+
+    @JSMethod
+    public String pd(String html, String rule, String urlKey) {
+        return parser.parseDomForUrl(html, rule, urlKey);
+    }
+
+    @JSMethod
+    public String pdfh(String html, String rule) {
+        return parser.parseDomForUrl(html, rule, "");
+    }
+
+    @JSMethod
+    public JSArray pdfa(String html, String rule) {
+        return JSUtil.toArray(ctx, parser.parseDomForArray(html, rule));
+    }
+
+    @JSMethod
+    public JSArray pdfl(String html, String rule, String texts, String urls, String urlKey) {
+        return JSUtil.toArray(ctx, parser.parseDomForList(html, rule, texts, urls, urlKey));
+    }
+}
diff --git a/app/src/main/java/com/github/catvod/js/bean/Cache.java b/app/src/main/java/com/github/catvod/js/bean/Cache.java
new file mode 100644
index 0000000..cd9d4e7
--- /dev/null
+++ b/app/src/main/java/com/github/catvod/js/bean/Cache.java
@@ -0,0 +1,40 @@
+package com.github.catvod.js.bean;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+/**
+ * Keeps the last parsed document for each of the two lookups, so that consecutive calls with an
+ * equal HTML string reuse the existing {@link Document} instead of parsing again.
+ */
+public class Cache {
+
+    private String pdfhHtml;
+    private String pdfaHtml;
+    private Document pdfhDoc;
+    private Document pdfaDoc;
+
+    public Document getPdfh(String html) {
+        updatePdfh(html);
+        return pdfhDoc;
+    }
+
+    public Document getPdfa(String html) {
+        updatePdfa(html);
+        return pdfaDoc;
+    }
+
+    private void updatePdfh(String html) {
+        // Treat null as empty markup rather than failing on html.equals().
+        if (html == null) html = "";
+        if (html.equals(pdfhHtml)) return;
+        pdfhDoc = Jsoup.parse(pdfhHtml = html);
+    }
+
+    private void updatePdfa(String html) {
+        // Treat null as empty markup rather than failing on html.equals().
+        if (html == null) html = "";
+        if (html.equals(pdfaHtml)) return;
+        pdfaDoc = Jsoup.parse(pdfaHtml = html);
+    }
+}
diff --git a/app/src/main/java/com/github/catvod/js/bean/Info.java b/app/src/main/java/com/github/catvod/js/bean/Info.java
new file mode 100644
index 0000000..9c40c78
--- /dev/null
+++ b/app/src/main/java/com/github/catvod/js/bean/Info.java
@@ -0,0 +1,54 @@
+package com.github.catvod.js.bean;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Holds one selector step: the selector itself, an optional {@code eq(n)} index and the
+ * selectors listed after {@code --} that are removed from the match.
+ */
+public class Info {
+
+    public int index;
+    public String rule;
+    public List<String> excludes;
+
+    public Info(String rule) {
+        this.rule = rule;
+    }
+
+    public void setRule(String rule) {
+        this.rule = rule;
+    }
+
+    /**
+     * Reads the index from {@code pos} (for example {@code eq(1)}) and splits off any
+     * {@code --} exclusions found in the current rule or in {@code pos}.
+     */
+    public void setInfo(String pos) {
+        if (rule.contains("--")) {
+            String[] rules = rule.split("--");
+            setExcludes(rules);
+            setRule(rules[0]);
+        } else if (pos.contains("--")) {
+            String[] rules = pos.split("--");
+            setExcludes(rules);
+            pos = rules[0];
+        }
+        try {
+            index = Integer.parseInt(pos.replace("eq(", "").replace(")", ""));
+        } catch (Exception ignored) {
+            // Anything that is not a plain integer falls back to index 0.
+            index = 0;
+        }
+    }
+
+    /**
+     * Stores every entry of {@code rules} except the first one as an exclusion.
+     */
+    public void setExcludes(String[] rules) {
+        excludes = new ArrayList<>(Arrays.asList(rules));
+        excludes.remove(0);
+    }
+}
diff --git a/app/src/main/java/com/github/catvod/js/utils/JSUtil.java b/app/src/main/java/com/github/catvod/js/utils/JSUtil.java
new file mode 100644
index 0000000..db547ba
--- /dev/null
+++ b/app/src/main/java/com/github/catvod/js/utils/JSUtil.java
@@ -0,0 +1,20 @@
+package com.github.catvod.js.utils;
+
+import com.whl.quickjs.wrapper.JSArray;
+import com.whl.quickjs.wrapper.QuickJSContext;
+
+import java.util.List;
+
+public class JSUtil {
+
+    /**
+     * Copies {@code items} into a new array created on {@code ctx}; a null or empty list
+     * yields an empty array.
+     */
+    public static JSArray toArray(QuickJSContext ctx, List<String> items) {
+        JSArray array = ctx.createNewJSArray();
+        if (items == null || items.isEmpty()) return array;
+        for (int i = 0; i < items.size(); i++) array.set(items.get(i), i);
+        return array;
+    }
+}
diff --git a/app/src/main/java/com/github/catvod/js/utils/Parser.java b/app/src/main/java/com/github/catvod/js/utils/Parser.java
new file mode 100644
index 0000000..de5297e
--- /dev/null
+++ b/app/src/main/java/com/github/catvod/js/utils/Parser.java
@@ -0,0 +1,205 @@
+package com.github.catvod.js.utils;
+
+import android.text.TextUtils;
+
+import com.github.catvod.js.bean.Cache;
+import com.github.catvod.js.bean.Info;
+import com.github.catvod.utils.UriUtil;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Evaluates "&&"-separated selector rules against HTML with jsoup and returns text, markup or
+ * attribute values.
+ */
+public class Parser {
+
+    // Compiled once for the class instead of once per Parser instance.
+    private static final Pattern URL = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);
+    private static final Pattern NO_ADD = Pattern.compile(":eq|:lt|:gt|:first|:last|:not|:even|:odd|:has|:contains|:matches|:empty|^body$|^#");
+    private static final Pattern JOIN_URL = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$|^(data-|url-|src-)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
+    private static final Pattern SPEC_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
+
+    private final Cache cache;
+
+    public Parser() {
+        cache = new Cache();
+    }
+
+    /**
+     * Splits a single selector step into its selector, {@code :eq} index and {@code --} exclusions.
+     */
+    private Info getParseInfo(String rule) {
+        Info info = new Info(rule);
+        if (rule.contains(":eq")) {
+            info.setRule(rule.split(":")[0]);
+            info.setInfo(rule.split(":")[1]);
+        } else if (rule.contains("--")) {
+            String[] rules = rule.split("--");
+            info.setExcludes(rules);
+            info.setRule(rules[0]);
+        }
+        return info;
+    }
+
+    /**
+     * Joins the "&&" parts of a rule with spaces and appends {@code :eq(0)} to every part whose
+     * last token is not matched by {@link #NO_ADD}. When {@code first} is false the final part is
+     * left without an index.
+     */
+    private String parseHikerToJq(String parse, boolean first) {
+        if (!parse.contains("&&")) {
+            String[] split = parse.split(" ");
+            Matcher m = NO_ADD.matcher(split[split.length - 1]);
+            if (!m.find() && first) parse = parse + ":eq(0)";
+            return parse;
+        }
+        String[] parses = parse.split("&&");
+        List<String> items = new ArrayList<>();
+        for (int i = 0; i < parses.length; i++) {
+            String[] split = parses[i].split(" ");
+            if (NO_ADD.matcher(split[split.length - 1]).find()) {
+                items.add(parses[i]);
+            } else {
+                if (!first && i >= parses.length - 1) items.add(parses[i]);
+                else items.add(parses[i] + ":eq(0)");
+            }
+        }
+        return TextUtils.join(" ", items);
+    }
+
+    /**
+     * Resolves {@code rule} against {@code html} and returns a single string. A trailing "Text" or
+     * "Html" part selects text or inner markup; any other trailing part is a "||"-separated list of
+     * attribute names and the first non-empty value wins. Attribute names matched by JOIN_URL have
+     * relative values resolved against {@code addUrl} when it is not empty.
+     */
+    public String parseDomForUrl(String html, String rule, String addUrl) {
+        Document doc = cache.getPdfh(html);
+        if ("body&&Text".equals(rule) || "Text".equals(rule)) {
+            return doc.text();
+        } else if ("body&&Html".equals(rule) || "Html".equals(rule)) {
+            return doc.html();
+        }
+        String option = "";
+        if (rule.contains("&&")) {
+            String[] rs = rule.split("&&");
+            option = rs[rs.length - 1];
+            List<String> excludes = new ArrayList<>(Arrays.asList(rs));
+            excludes.remove(rs.length - 1);
+            rule = TextUtils.join("&&", excludes);
+        }
+        rule = parseHikerToJq(rule, true);
+        String[] parses = rule.split(" ");
+        Elements elements = new Elements();
+        for (String parse : parses) {
+            elements = parseOneRule(doc, parse, elements);
+            if (elements.isEmpty()) return "";
+        }
+        if (TextUtils.isEmpty(option)) return elements.outerHtml();
+        if ("Text".equals(option)) {
+            return elements.text();
+        } else if ("Html".equals(option)) {
+            return elements.html();
+        } else {
+            String result = "";
+            // Alternatives are separated by a literal "||".
+            for (String s : option.split("\\|\\|")) {
+                result = elements.attr(s);
+                if (s.toLowerCase().contains("style") && result.contains("url(")) {
+                    Matcher m = URL.matcher(result);
+                    if (m.find()) result = m.group(1);
+                    // Drop one pair of surrounding quotes left over from url('...').
+                    result = result.replaceAll("^['\"](.*)['\"]$", "$1");
+                }
+                if (!result.isEmpty() && !TextUtils.isEmpty(addUrl)) {
+                    if (JOIN_URL.matcher(s).find() && !SPEC_URL.matcher(result).find()) {
+                        if (result.contains("http")) {
+                            result = result.substring(result.indexOf("http"));
+                        } else {
+                            result = UriUtil.resolve(addUrl, result);
+                        }
+                    }
+                }
+                if (!result.isEmpty()) {
+                    return result;
+                }
+            }
+            return result;
+        }
+    }
+
+    /**
+     * Returns the outer HTML of every element matched by {@code rule}, or an empty list when a
+     * step matches nothing.
+     */
+    public List<String> parseDomForArray(String html, String rule) {
+        Document doc = cache.getPdfa(html);
+        rule = parseHikerToJq(rule, false);
+        String[] parses = rule.split(" ");
+        Elements elements = new Elements();
+        for (String parse : parses) {
+            elements = parseOneRule(doc, parse, elements);
+            if (elements.isEmpty()) return new ArrayList<>();
+        }
+        List<String> items = new ArrayList<>();
+        for (Element element : elements) items.add(element.outerHtml());
+        return items;
+    }
+
+    /**
+     * Applies one selector step to {@code elements}, or to {@code doc} when {@code elements} is
+     * empty, then applies the step's {@code :eq} index and {@code --} exclusions.
+     */
+    private Elements parseOneRule(Document doc, String parse, Elements elements) {
+        Info info = getParseInfo(parse);
+        if (elements.isEmpty()) {
+            elements = doc.select(info.rule);
+        } else {
+            elements = elements.select(info.rule);
+        }
+        if (parse.contains(":eq")) {
+            // A negative index counts from the end. One that still points before the first
+            // element selects nothing instead of reaching Elements.eq() with a negative value.
+            int index = info.index < 0 ? elements.size() + info.index : info.index;
+            elements = index < 0 ? new Elements() : elements.eq(index);
+        }
+        if (info.excludes != null && !elements.isEmpty()) {
+            elements = elements.clone();
+            for (int i = 0; i < info.excludes.size(); i++) {
+                elements.select(info.excludes.get(i)).remove();
+            }
+        }
+        return elements;
+    }
+
+    /**
+     * Builds one {@code text$url} entry per element matched by {@code rule}; {@code texts} and
+     * {@code urls} are evaluated against each element's outer HTML.
+     */
+    public List<String> parseDomForList(String html, String rule, String texts, String urls, String urlKey) {
+        String[] parses = parseHikerToJq(rule, false).split(" ");
+        // The document does not change between steps, so look it up once.
+        Document doc = cache.getPdfa(html);
+        Elements elements = new Elements();
+        for (String parse : parses) {
+            elements = parseOneRule(doc, parse, elements);
+            if (elements.isEmpty()) return Collections.emptyList();
+        }
+        List<String> items = new ArrayList<>();
+        for (Element element : elements) {
+            String item = element.outerHtml();
+            items.add(parseDomForUrl(item, texts, "").trim() + '$' + parseDomForUrl(item, urls, urlKey));
+        }
+        return items;
+    }
+}
diff --git a/app/src/main/java/com/github/catvod/utils/UriUtil.java b/app/src/main/java/com/github/catvod/utils/UriUtil.java
new file mode 100644
index 0000000..4140e0a
--- /dev/null
+++ b/app/src/main/java/com/github/catvod/utils/UriUtil.java
@@ -0,0 +1,231 @@
+package com.github.catvod.utils;
+
+import android.text.TextUtils;
+
+import androidx.annotation.Nullable;
+
+/**
+ * Utility methods for manipulating URIs.
+ */
+public final class UriUtil {
+
+    /**
+     * The length of arrays returned by {@link #getUriIndices(String)}.
+     */
+    private static final int INDEX_COUNT = 4;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the ':' after the scheme. Equals -1
+     * if the URI is a relative reference (no scheme). The hier-part starts at (schemeColon + 1),
+     * including when the URI has no scheme.
+     */
+    private static final int SCHEME_COLON = 0;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the path part. Equals (schemeColon +
+     * 1) if no authority part, (schemeColon + 3) if the authority part consists of just "//", and
+     * (query) if no path part. The characters starting at this index can be "//" only if the
+     * authority part is non-empty (in this case the double-slash means the first segment is empty).
+     */
+    private static final int PATH = 1;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the query part, including the '?'
+     * before the query. Equals fragment if no query part, and (fragment - 1) if the query part is a
+     * single '?' with no data.
+     */
+    private static final int QUERY = 2;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the fragment part, including the '#'
+     * before the fragment. Equal to the length of the URI if no fragment part, and (length - 1) if
+     * the fragment part is a single '#' with no data.
+     */
+    private static final int FRAGMENT = 3;
+
+    /**
+     * Performs relative resolution of a {@code referenceUri} with respect to a {@code baseUri}.
+     *
+     * <p>The resolution is performed as specified by RFC-3986.
+     *
+     * @param baseUri The base URI.
+     * @param referenceUri The reference URI to resolve.
+     */
+    public static String resolve(@Nullable String baseUri, @Nullable String referenceUri) {
+        StringBuilder uri = new StringBuilder();
+
+        // Map null onto empty string, to make the following logic simpler.
+        baseUri = baseUri == null ? "" : baseUri;
+        referenceUri = referenceUri == null ? "" : referenceUri;
+
+        int[] refIndices = getUriIndices(referenceUri);
+        if (refIndices[SCHEME_COLON] != -1) {
+            // The reference is absolute. The target Uri is the reference.
+            uri.append(referenceUri);
+            removeDotSegments(uri, refIndices[PATH], refIndices[QUERY]);
+            return uri.toString();
+        }
+
+        int[] baseIndices = getUriIndices(baseUri);
+        if (refIndices[FRAGMENT] == 0) {
+            // The reference is empty or contains just the fragment part, then the target Uri is the
+            // concatenation of the base Uri without its fragment, and the reference.
+            return uri.append(baseUri, 0, baseIndices[FRAGMENT]).append(referenceUri).toString();
+        }
+
+        if (refIndices[QUERY] == 0) {
+            // The reference starts with the query part. The target is the base up to (but excluding) the
+            // query, plus the reference.
+            return uri.append(baseUri, 0, baseIndices[QUERY]).append(referenceUri).toString();
+        }
+
+        if (refIndices[PATH] != 0) {
+            // The reference has authority. The target is the base scheme plus the reference.
+            int baseLimit = baseIndices[SCHEME_COLON] + 1;
+            uri.append(baseUri, 0, baseLimit).append(referenceUri);
+            return removeDotSegments(uri, baseLimit + refIndices[PATH], baseLimit + refIndices[QUERY]);
+        }
+
+        if (referenceUri.charAt(refIndices[PATH]) == '/') {
+            // The reference path is rooted. The target is the base scheme and authority (if any), plus
+            // the reference.
+            uri.append(baseUri, 0, baseIndices[PATH]).append(referenceUri);
+            return removeDotSegments(uri, baseIndices[PATH], baseIndices[PATH] + refIndices[QUERY]);
+        }
+
+        // The target Uri is the concatenation of the base Uri up to (but excluding) the last segment,
+        // and the reference. This can be split into 2 cases:
+        if (baseIndices[SCHEME_COLON] + 2 < baseIndices[PATH] && baseIndices[PATH] == baseIndices[QUERY]) {
+            // Case 1: The base hier-part is just the authority, with an empty path. An additional '/' is
+            // needed after the authority, before appending the reference.
+            uri.append(baseUri, 0, baseIndices[PATH]).append('/').append(referenceUri);
+            return removeDotSegments(uri, baseIndices[PATH], baseIndices[PATH] + refIndices[QUERY] + 1);
+        } else {
+            // Case 2: Otherwise, find the last '/' in the base hier-part and append the reference after
+            // it. If base hier-part has no '/', it could only mean that it is completely empty or
+            // contains only one segment, in which case the whole hier-part is excluded and the reference
+            // is appended right after the base scheme colon without an added '/'.
+            int lastSlashIndex = baseUri.lastIndexOf('/', baseIndices[QUERY] - 1);
+            int baseLimit = lastSlashIndex == -1 ? baseIndices[PATH] : lastSlashIndex + 1;
+            uri.append(baseUri, 0, baseLimit).append(referenceUri);
+            return removeDotSegments(uri, baseIndices[PATH], baseLimit + refIndices[QUERY]);
+        }
+    }
+
+    /**
+     * Removes dot segments from the path of a URI.
+     *
+     * @param uri A {@link StringBuilder} containing the URI.
+     * @param offset The index of the start of the path in {@code uri}.
+     * @param limit The limit (exclusive) of the path in {@code uri}.
+     */
+    private static String removeDotSegments(StringBuilder uri, int offset, int limit) {
+        if (offset >= limit) {
+            // Nothing to do.
+            return uri.toString();
+        }
+        if (uri.charAt(offset) == '/') {
+            // If the path starts with a /, always retain it.
+            offset++;
+        }
+        // The first character of the current path segment.
+        int segmentStart = offset;
+        int i = offset;
+        while (i <= limit) {
+            int nextSegmentStart;
+            if (i == limit) {
+                nextSegmentStart = i;
+            } else if (uri.charAt(i) == '/') {
+                nextSegmentStart = i + 1;
+            } else {
+                i++;
+                continue;
+            }
+            // We've encountered the end of a segment or the end of the path. If the final segment was
+            // "." or "..", remove the appropriate segments of the path.
+            if (i == segmentStart + 1 && uri.charAt(segmentStart) == '.') {
+                // Given "abc/def/./ghi", remove "./" to get "abc/def/ghi".
+                uri.delete(segmentStart, nextSegmentStart);
+                limit -= nextSegmentStart - segmentStart;
+                i = segmentStart;
+            } else if (i == segmentStart + 2 && uri.charAt(segmentStart) == '.' && uri.charAt(segmentStart + 1) == '.') {
+                // Given "abc/def/../ghi", remove "def/../" to get "abc/ghi".
+                int prevSegmentStart = uri.lastIndexOf("/", segmentStart - 2) + 1;
+                int removeFrom = Math.max(prevSegmentStart, offset);
+                uri.delete(removeFrom, nextSegmentStart);
+                limit -= nextSegmentStart - removeFrom;
+                segmentStart = prevSegmentStart;
+                i = prevSegmentStart;
+            } else {
+                i++;
+                segmentStart = i;
+            }
+        }
+        return uri.toString();
+    }
+
+    /**
+     * Calculates indices of the constituent components of a URI.
+     *
+     * @param uriString The URI as a string.
+     * @return The corresponding indices.
+     */
+    private static int[] getUriIndices(String uriString) {
+        int[] indices = new int[INDEX_COUNT];
+        if (TextUtils.isEmpty(uriString)) {
+            indices[SCHEME_COLON] = -1;
+            return indices;
+        }
+
+        // Determine outer structure from right to left.
+        // Uri = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+        int length = uriString.length();
+        int fragmentIndex = uriString.indexOf('#');
+        if (fragmentIndex == -1) {
+            fragmentIndex = length;
+        }
+        int queryIndex = uriString.indexOf('?');
+        if (queryIndex == -1 || queryIndex > fragmentIndex) {
+            // '#' before '?': '?' is within the fragment.
+            queryIndex = fragmentIndex;
+        }
+        // Slashes are allowed only in hier-part so any colon after the first slash is part of the
+        // hier-part, not the scheme colon separator.
+        int schemeIndexLimit = uriString.indexOf('/');
+        if (schemeIndexLimit == -1 || schemeIndexLimit > queryIndex) {
+            schemeIndexLimit = queryIndex;
+        }
+        int schemeIndex = uriString.indexOf(':');
+        if (schemeIndex > schemeIndexLimit) {
+            // '/' before ':'
+            schemeIndex = -1;
+        }
+
+        // Determine hier-part structure: hier-part = "//" authority path / path
+        // This block can also cope with schemeIndex == -1.
+        boolean hasAuthority = schemeIndex + 2 < queryIndex && uriString.charAt(schemeIndex + 1) == '/' && uriString.charAt(schemeIndex + 2) == '/';
+        int pathIndex;
+        if (hasAuthority) {
+            pathIndex = uriString.indexOf('/', schemeIndex + 3); // find first '/' after "://"
+            if (pathIndex == -1 || pathIndex > queryIndex) {
+                pathIndex = queryIndex;
+            }
+        } else {
+            pathIndex = schemeIndex + 1;
+        }
+
+        indices[SCHEME_COLON] = schemeIndex;
+        indices[PATH] = pathIndex;
+        indices[QUERY] = queryIndex;
+        indices[FRAGMENT] = fragmentIndex;
+        return indices;
+    }
+}
diff --git a/jar/custom_spider.jar b/jar/custom_spider.jar
index 34ca446..3784b64 100644
Binary files a/jar/custom_spider.jar and b/jar/custom_spider.jar differ
diff --git a/jar/custom_spider.jar.md5 b/jar/custom_spider.jar.md5
index 74ce4ae..a64792b 100644
--- a/jar/custom_spider.jar.md5
+++ b/jar/custom_spider.jar.md5
@@ -1 +1 @@
-6fc8fb3791e3d877fa06ed91c23a77f4
+41d830d74afd5a31464b2f6678cd3f2e
diff --git a/jar/genJar.bat b/jar/genJar.bat
index 7e51a91..72e4055 100644
--- a/jar/genJar.bat
+++ b/jar/genJar.bat
@@ -6,12 +6,14 @@ rd /s/q "%~dp0\Smali_classes"
 java -jar "%~dp0\3rd\apktool_2.11.0.jar" d -f --only-main-classes "%~dp0\..\app\build\outputs\apk\release\app-release-unsigned.apk" -o "%~dp0\Smali_classes"
 
 rd /s/q "%~dp0\spider.jar\smali\com\github\catvod\spider"
+rd /s/q "%~dp0\spider.jar\smali\com\github\catvod\js"
 rd /s/q "%~dp0\spider.jar\smali\org\slf4j\"
 
 if not exist "%~dp0\spider.jar\smali\com\github\catvod\" md "%~dp0\spider.jar\smali\com\github\catvod\"
 if not exist "%~dp0\spider.jar\smali\org\slf4j\" md "%~dp0\spider.jar\smali\org\slf4j\"
 
 move "%~dp0\Smali_classes\smali\com\github\catvod\spider" "%~dp0\spider.jar\smali\com\github\catvod\"
+move "%~dp0\Smali_classes\smali\com\github\catvod\js" "%~dp0\spider.jar\smali\com\github\catvod\"
 move "%~dp0\Smali_classes\smali\org\slf4j" "%~dp0\spider.jar\smali\org\slf4j\"
 
 java -jar "%~dp0\3rd\apktool_2.11.0.jar" b "%~dp0\spider.jar" -c