Add Function for drpy

10 months ago · 6f03a1323d
parent 4fb8666f89
commit 6f03a1323d
11 changed files with 553 additions and 4 deletions
--- a/app/build.gradle
+++ b/app/build.gradle
@ -42,9 +42,9 @@ android {
 dependencies {
    implementation 'com.squareup.okhttp3:okhttp:' + okhttpVersion
    implementation 'com.github.thegrizzlylabs:sardine-android:0.9'
-    implementation 'wang.harlon.quickjs:wrapper-android:2.4.0'
-    implementation 'com.google.code.gson:gson:2.11.0'
+    implementation 'wang.harlon.quickjs:wrapper-android:2.4.3'
+    implementation 'com.google.code.gson:gson:2.12.1'
    implementation 'cn.wanghaomiao:JsoupXpath:2.5.1'
    implementation 'com.orhanobut:logger:2.2.0'
-    implementation 'org.jsoup:jsoup:1.15.3'
+    implementation 'org.jsoup:jsoup:1.17.2'
 }
--- a/app/proguard-rules.pro
+++ b/app/proguard-rules.pro
@ -15,6 +15,7 @@
 # Spider
 -keep class com.github.catvod.crawler.* { *; }
 -keep class com.github.catvod.spider.* { public <methods>; }
+-keep class com.github.catvod.js.Function { *; }

 # OkHttp
 -dontwarn okhttp3.**
--- a/app/src/main/java/com/github/catvod/js/Function.java
+++ b/app/src/main/java/com/github/catvod/js/Function.java
@ -0,0 +1,54 @@
+package com.github.catvod.js;
+
+import com.github.catvod.js.utils.Parser;
+import com.github.catvod.js.utils.JSUtil;
+import com.whl.quickjs.wrapper.JSArray;
+import com.whl.quickjs.wrapper.JSMethod;
+import com.whl.quickjs.wrapper.QuickJSContext;
+
+import java.lang.reflect.Method;
+
+public class Function {
+
+    private final QuickJSContext ctx;
+    private final Parser parser;
+
+    public Function(QuickJSContext ctx) {
+        this.parser = new Parser();
+        this.ctx = ctx;
+        setProperty();
+    }
+
+    private void setProperty() {
+        for (Method method : getClass().getMethods()) {
+            if (!method.isAnnotationPresent(JSMethod.class)) continue;
+            ctx.getGlobalObject().setProperty(method.getName(), args -> {
+                try {
+                    return method.invoke(this, args);
+                } catch (Exception e) {
+                    return null;
+                }
+            });
+        }
+    }
+
+    @JSMethod
+    public String pd(String html, String rule, String urlKey) {
+        return parser.parseDomForUrl(html, rule, urlKey);
+    }
+
+    @JSMethod
+    public String pdfh(String html, String rule) {
+        return parser.parseDomForUrl(html, rule, "");
+    }
+
+    @JSMethod
+    public JSArray pdfa(String html, String rule) {
+        return JSUtil.toArray(ctx, parser.parseDomForArray(html, rule));
+    }
+
+    @JSMethod
+    public JSArray pdfl(String html, String rule, String texts, String urls, String urlKey) {
+        return JSUtil.toArray(ctx, parser.parseDomForList(html, rule, texts, urls, urlKey));
+    }
+}
--- a/app/src/main/java/com/github/catvod/js/bean/Cache.java
+++ b/app/src/main/java/com/github/catvod/js/bean/Cache.java
@ -0,0 +1,32 @@
+package com.github.catvod.js.bean;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+public class Cache {
+
+    private String pdfhHtml;
+    private String pdfaHtml;
+    private Document pdfhDoc;
+    private Document pdfaDoc;
+
+    public Document getPdfh(String html) {
+        updatePdfh(html);
+        return pdfhDoc;
+    }
+
+    public Document getPdfa(String html) {
+        updatePdfa(html);
+        return pdfaDoc;
+    }
+
+    private void updatePdfh(String html) {
+        if (html.equals(pdfhHtml)) return;
+        pdfhDoc = Jsoup.parse(pdfhHtml = html);
+    }
+
+    private void updatePdfa(String html) {
+        if (html.equals(pdfaHtml)) return;
+        pdfaDoc = Jsoup.parse(pdfaHtml = html);
+    }
+}
--- a/app/src/main/java/com/github/catvod/js/bean/Info.java
+++ b/app/src/main/java/com/github/catvod/js/bean/Info.java
@ -0,0 +1,42 @@
+package com.github.catvod.js.bean;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Parsed form of one selector segment: the bare selector ({@link #rule}), the element position
+ * requested through {@code eq(n)} ({@link #index}) and the selectors listed after {@code --}
+ * ({@link #excludes}).
+ */
+public class Info {
+
+    public int index;
+    public String rule;
+    public List<String> excludes;
+
+    /**
+     * @param rule the raw selector segment; it may still contain {@code --} exclusions
+     */
+    public Info(String rule) {
+        this.rule = rule;
+    }
+
+    public void setRule(String rule) {
+        this.rule = rule;
+    }
+
+    /**
+     * Extracts exclusions and the element position.
+     *
+     * <p>Exclusions are taken from {@link #rule} when it contains {@code --}, otherwise from
+     * {@code pos}. The position is the integer inside {@code eq(...)}; anything that is not a valid
+     * integer yields {@code 0}.
+     *
+     * @param pos the position token, e.g. {@code eq(2)} or {@code eq(-1)--em}; {@code null} is
+     *     treated as an empty token
+     */
+    public void setInfo(String pos) {
+        if (pos == null) pos = "";
+        if (rule.contains("--")) {
+            String[] rules = rule.split("--");
+            setExcludes(rules);
+            setRule(head(rules));
+        } else if (pos.contains("--")) {
+            String[] rules = pos.split("--");
+            setExcludes(rules);
+            pos = head(rules);
+        }
+        try {
+            index = Integer.parseInt(pos.replace("eq(", "").replace(")", ""));
+        } catch (NumberFormatException ignored) {
+            // A missing or malformed position means "first element".
+            index = 0;
+        }
+    }
+
+    /**
+     * Stores every element of {@code rules} except the first as an exclusion.
+     *
+     * @param rules the pieces of a rule split on {@code --}; may be empty or {@code null}, in which
+     *     case the exclusion list is empty
+     */
+    public void setExcludes(String[] rules) {
+        if (rules == null || rules.length == 0) {
+            // "--".split("--") yields an empty array; there is nothing to drop and nothing to keep.
+            excludes = new ArrayList<>();
+            return;
+        }
+        excludes = new ArrayList<>(Arrays.asList(rules).subList(1, rules.length));
+    }
+
+    /** Returns the first piece of a split rule, or an empty string when the split produced none. */
+    private static String head(String[] parts) {
+        return parts.length > 0 ? parts[0] : "";
+    }
+}
--- a/app/src/main/java/com/github/catvod/js/utils/JSUtil.java
+++ b/app/src/main/java/com/github/catvod/js/utils/JSUtil.java
@ -0,0 +1,16 @@
+package com.github.catvod.js.utils;
+
+import com.whl.quickjs.wrapper.JSArray;
+import com.whl.quickjs.wrapper.QuickJSContext;
+
+import java.util.List;
+
+public class JSUtil {
+
+    public static JSArray toArray(QuickJSContext ctx, List<String> items) {
+        JSArray array = ctx.createNewJSArray();
+        if (items == null || items.isEmpty()) return array;
+        for (int i = 0; i < items.size(); i++) array.set(items.get(i), i);
+        return array;
+    }
+}
--- a/app/src/main/java/com/github/catvod/js/utils/Parser.java
+++ b/app/src/main/java/com/github/catvod/js/utils/Parser.java
@ -0,0 +1,171 @@
+package com.github.catvod.js.utils;
+
+import android.text.TextUtils;
+
+import com.github.catvod.js.bean.Cache;
+import com.github.catvod.js.bean.Info;
+import com.github.catvod.utils.UriUtil;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class Parser {
+
+    /** Lazily captures the text between {@code url(} and the next {@code )}. */
+    private final Pattern URL = Pattern.compile("url\\((.*?)\\)", Pattern.DOTALL | Pattern.MULTILINE);
+    /** Matches a selector token that already carries a pseudo-selector, is exactly {@code body}, or starts with {@code #}. */
+    private final Pattern NO_ADD = Pattern.compile(":eq|:lt|:gt|:first|:last|:not|:even|:odd|:has|:contains|:matches|:empty|^body$|^#");
+    /** Matches attribute names whose values are candidates for URL joining. */
+    private final Pattern JOIN_URL = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$|^(data-|url-|src-)", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+    /** Matches values that start with a scheme which is never joined with a base URL. */
+    private final Pattern SPEC_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+
+    /** Parsed-document cache shared by all queries made through this parser. */
+    private final Cache cache = new Cache();
+
+    public Parser() {
+    }
+
+    /**
+     * Breaks one selector segment into its selector, {@code eq} position and {@code --} exclusions.
+     *
+     * <p>A segment containing {@code :eq} is cut at that marker: everything before it is the
+     * selector and the token after it (up to the next {@code ':'}, if any) is handed to
+     * {@link Info#setInfo(String)}. A segment without {@code :eq} is only checked for exclusions.
+     *
+     * @param rule a single segment, e.g. {@code div.item:eq(1)--span}
+     * @return the parsed description of the segment
+     */
+    private Info getParseInfo(String rule) {
+        Info info = new Info(rule);
+        int eq = rule.indexOf(":eq");
+        if (eq != -1) {
+            // Cut at the ":eq" marker itself, not at the first ':' of the segment, so that colons
+            // belonging to the selector (":not(...)", ":has(...)", attribute values) are kept.
+            info.setRule(rule.substring(0, eq));
+            info.setInfo(rule.substring(eq + 1).split(":")[0]);
+        } else if (rule.contains("--")) {
+            String[] rules = rule.split("--");
+            info.setExcludes(rules);
+            info.setRule(rules[0]);
+        }
+        return info;
+    }
+
+    /**
+     * Rewrites a {@code &&}-separated rule into a space-separated chain of selectors, appending
+     * {@code :eq(0)} to segments that do not already pin a position.
+     *
+     * @param parse the rule to rewrite
+     * @param first when {@code true} every unpinned segment gets {@code :eq(0)}; when {@code false}
+     *     the last segment is left as written
+     * @return the rewritten rule
+     */
+    private String parseHikerToJq(String parse, boolean first) {
+        if (parse.contains("&&")) {
+            String[] segments = parse.split("&&");
+            int last = segments.length - 1;
+            List<String> converted = new ArrayList<>();
+            for (int i = 0; i <= last; i++) {
+                String segment = segments[i];
+                boolean keep = isPinned(segment) || (!first && i >= last);
+                converted.add(keep ? segment : segment + ":eq(0)");
+            }
+            return TextUtils.join(" ", converted);
+        }
+        // Single segment: only the "first" mode adds the default position.
+        boolean append = !isPinned(parse) && first;
+        return append ? parse + ":eq(0)" : parse;
+    }
+
+    /** Tells whether the last space-separated token of {@code segment} matches {@code NO_ADD}. */
+    private boolean isPinned(String segment) {
+        String[] tokens = segment.split(" ");
+        return NO_ADD.matcher(tokens[tokens.length - 1]).find();
+    }
+
+    /**
+     * Evaluates a rule against {@code html} and returns a single string.
+     *
+     * <p>The rule is a chain of selectors separated by {@code &&}; its last part is the output
+     * option: {@code Text}, {@code Html}, or one or more attribute names separated by {@code |}
+     * characters, tried in order until one yields a non-empty value. A rule without {@code &&}
+     * returns the outer HTML of the selection. The rules {@code Text}, {@code body&&Text},
+     * {@code Html} and {@code body&&Html} apply to the whole document.
+     *
+     * @param html   the markup to query
+     * @param rule   the rule to evaluate
+     * @param addUrl base URL used to join URL-like attribute values; empty or {@code null} disables
+     *     joining
+     * @return the extracted value, or an empty string when nothing matches
+     */
+    public String parseDomForUrl(String html, String rule, String addUrl) {
+        Document doc = cache.getPdfh(html);
+        if ("body&&Text".equals(rule) || "Text".equals(rule)) {
+            return doc.text();
+        } else if ("body&&Html".equals(rule) || "Html".equals(rule)) {
+            return doc.html();
+        }
+        String option = "";
+        if (rule.contains("&&")) {
+            // The last part is the output option; the parts before it form the selector chain.
+            String[] rs = rule.split("&&");
+            option = rs[rs.length - 1];
+            rule = TextUtils.join("&&", Arrays.asList(rs).subList(0, rs.length - 1));
+        }
+        rule = parseHikerToJq(rule, true);
+        Elements elements = new Elements();
+        for (String parse : rule.split(" ")) {
+            elements = parseOneRule(doc, parse, elements);
+            if (elements.isEmpty()) return "";
+        }
+        if (TextUtils.isEmpty(option)) return elements.outerHtml();
+        if ("Text".equals(option)) return elements.text();
+        if ("Html".equals(option)) return elements.html();
+        // Split on runs of '|' so that both "a|b" and "a||b" yield exactly the names "a" and "b";
+        // splitting on a single '|' produced an empty attribute name between the two bars.
+        for (String name : option.split("\\|+")) {
+            if (name.isEmpty()) continue;
+            String value = readAttribute(elements, name, addUrl);
+            if (!value.isEmpty()) return value;
+        }
+        return "";
+    }
+
+    /** Reads attribute {@code name} from {@code elements}, unwrapping {@code url(...)} in style values and joining URL-like values with {@code addUrl}. */
+    private String readAttribute(Elements elements, String name, String addUrl) {
+        String value = elements.attr(name);
+        if (name.toLowerCase().contains("style") && value.contains("url(")) {
+            Matcher m = URL.matcher(value);
+            if (m.find()) value = m.group(1);
+            // Drop one surrounding quote character on each side of the captured URL.
+            value = value.replaceAll("^['|\"](.*)['|\"]$", "$1");
+        }
+        // Nothing to join when there is no value or no base.
+        if (value.isEmpty() || TextUtils.isEmpty(addUrl)) return value;
+        // Only URL-like attributes are joined, and never values with a scheme from SPEC_URL.
+        if (!JOIN_URL.matcher(name).find() || SPEC_URL.matcher(value).find()) return value;
+        // A value that already contains "http" is cut to start there; anything else is resolved
+        // against the base.
+        int http = value.indexOf("http");
+        return http != -1 ? value.substring(http) : UriUtil.resolve(addUrl, value);
+    }
+
+    /**
+     * Evaluates a selector chain against {@code html} and returns the outer HTML of every match.
+     *
+     * @param html the markup to query
+     * @param rule the {@code &&}-separated selector chain
+     * @return one entry per matched element, or an empty list as soon as a step matches nothing
+     */
+    public List<String> parseDomForArray(String html, String rule) {
+        Document doc = cache.getPdfa(html);
+        String[] steps = parseHikerToJq(rule, false).split(" ");
+        Elements matched = new Elements();
+        for (int i = 0; i < steps.length; i++) {
+            matched = parseOneRule(doc, steps[i], matched);
+            if (matched.isEmpty()) {
+                return new ArrayList<>();
+            }
+        }
+        List<String> fragments = new ArrayList<>();
+        for (Element match : matched) {
+            fragments.add(match.outerHtml());
+        }
+        return fragments;
+    }
+
+    /**
+     * Applies one selector segment to the running selection.
+     *
+     * @param doc      the document queried when {@code elements} is empty
+     * @param parse    the segment, optionally carrying {@code :eq(n)} and {@code --} exclusions
+     * @param elements the selection produced by the previous segment; empty for the first one
+     * @return the narrowed selection; empty when nothing matches or the position is out of range
+     */
+    private Elements parseOneRule(Document doc, String parse, Elements elements) {
+        Info info = getParseInfo(parse);
+        if (elements.isEmpty()) {
+            elements = doc.select(info.rule);
+        } else {
+            elements = elements.select(info.rule);
+        }
+        if (parse.contains(":eq")) {
+            // Negative positions count from the end of the selection.
+            int index = info.index < 0 ? elements.size() + info.index : info.index;
+            // A position that is still negative points before the first match; report "no match"
+            // instead of passing a negative index on.
+            elements = index < 0 ? new Elements() : elements.eq(index);
+        }
+        if (info.excludes != null && !elements.isEmpty()) {
+            // Remove the excluded nodes from a copy, presumably so the cached document stays intact.
+            elements = elements.clone();
+            for (String exclude : info.excludes) {
+                elements.select(exclude).remove();
+            }
+        }
+        return elements;
+    }
+
+    /**
+     * Evaluates a selector chain against {@code html} and builds one {@code text$url} entry per
+     * matched element.
+     *
+     * @param html   the markup to query
+     * @param rule   the {@code &&}-separated selector chain choosing the elements
+     * @param texts  the rule evaluated on each element for the part before {@code $} (trimmed)
+     * @param urls   the rule evaluated on each element for the part after {@code $}
+     * @param urlKey base URL passed on when evaluating {@code urls}
+     * @return the entries in document order, or an empty list as soon as a step matches nothing
+     */
+    public List<String> parseDomForList(String html, String rule, String texts, String urls, String urlKey) {
+        String[] steps = parseHikerToJq(rule, false).split(" ");
+        Document doc = cache.getPdfa(html);
+        Elements rows = new Elements();
+        for (String step : steps) {
+            rows = parseOneRule(doc, step, rows);
+            if (rows.isEmpty()) {
+                return Collections.emptyList();
+            }
+        }
+        List<String> entries = new ArrayList<>();
+        for (Element row : rows) {
+            String fragment = row.outerHtml();
+            String label = parseDomForUrl(fragment, texts, "").trim();
+            String link = parseDomForUrl(fragment, urls, urlKey);
+            entries.add(label + '$' + link);
+        }
+        return entries;
+    }
+}
--- a/app/src/main/java/com/github/catvod/utils/UriUtil.java
+++ b/app/src/main/java/com/github/catvod/utils/UriUtil.java
@ -0,0 +1,231 @@
+package com.github.catvod.utils;
+
+import android.text.TextUtils;
+
+import androidx.annotation.Nullable;
+
+/**
+ * Utility methods for manipulating URIs.
+ */
+public final class UriUtil {
+
+    /**
+     * The length of arrays returned by {@link #getUriIndices(String)}.
+     */
+    private static final int INDEX_COUNT = 4;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the ':' after the scheme. Equals -1
+     * if the URI is a relative reference (no scheme). The hier-part starts at (schemeColon + 1),
+     * including when the URI has no scheme.
+     */
+    private static final int SCHEME_COLON = 0;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the path part. Equals (schemeColon +
+     * 1) if no authority part, (schemeColon + 3) if the authority part consists of just "//", and
+     * (query) if no path part. The characters starting at this index can be "//" only if the
+     * authority part is non-empty (in this case the double-slash means the first segment is empty).
+     */
+    private static final int PATH = 1;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the query part, including the '?'
+     * before the query. Equals fragment if no query part, and (fragment - 1) if the query part is a
+     * single '?' with no data.
+     */
+    private static final int QUERY = 2;
+
+    /**
+     * An index into an array returned by {@link #getUriIndices(String)}.
+     *
+     * <p>The value at this position in the array is the index of the fragment part, including the '#'
+     * before the fragment. Equal to the length of the URI if no fragment part, and (length - 1) if
+     * the fragment part is a single '#' with no data.
+     */
+    private static final int FRAGMENT = 3;
+
+    /**
+     * Performs relative resolution of a {@code referenceUri} with respect to a {@code baseUri}.
+     *
+     * <p>The resolution is performed as specified by RFC-3986.
+     *
+     * <p>A {@code null} argument is handled as if it were the empty string.
+     *
+     * @param baseUri      The base URI.
+     * @param referenceUri The reference URI to resolve.
+     * @return The resolved URI. When the reference has a scheme, this is the reference itself with
+     *     dot segments removed from its path.
+     */
+    public static String resolve(@Nullable String baseUri, @Nullable String referenceUri) {
+        StringBuilder uri = new StringBuilder();
+
+        // Map null onto empty string, to make the following logic simpler.
+        baseUri = baseUri == null ? "" : baseUri;
+        referenceUri = referenceUri == null ? "" : referenceUri;
+
+        int[] refIndices = getUriIndices(referenceUri);
+        if (refIndices[SCHEME_COLON] != -1) {
+            // The reference is absolute. The target Uri is the reference.
+            uri.append(referenceUri);
+            removeDotSegments(uri, refIndices[PATH], refIndices[QUERY]);
+            return uri.toString();
+        }
+
+        int[] baseIndices = getUriIndices(baseUri);
+        if (refIndices[FRAGMENT] == 0) {
+            // The reference is empty or contains just the fragment part, then the target Uri is the
+            // concatenation of the base Uri without its fragment, and the reference.
+            return uri.append(baseUri, 0, baseIndices[FRAGMENT]).append(referenceUri).toString();
+        }
+
+        if (refIndices[QUERY] == 0) {
+            // The reference starts with the query part. The target is the base up to (but excluding) the
+            // query, plus the reference.
+            return uri.append(baseUri, 0, baseIndices[QUERY]).append(referenceUri).toString();
+        }
+
+        if (refIndices[PATH] != 0) {
+            // The reference has authority. The target is the base scheme plus the reference.
+            int baseLimit = baseIndices[SCHEME_COLON] + 1;
+            uri.append(baseUri, 0, baseLimit).append(referenceUri);
+            return removeDotSegments(uri, baseLimit + refIndices[PATH], baseLimit + refIndices[QUERY]);
+        }
+
+        if (referenceUri.charAt(refIndices[PATH]) == '/') {
+            // The reference path is rooted. The target is the base scheme and authority (if any), plus
+            // the reference.
+            uri.append(baseUri, 0, baseIndices[PATH]).append(referenceUri);
+            return removeDotSegments(uri, baseIndices[PATH], baseIndices[PATH] + refIndices[QUERY]);
+        }
+
+        // The target Uri is the concatenation of the base Uri up to (but excluding) the last segment,
+        // and the reference. This can be split into 2 cases:
+        if (baseIndices[SCHEME_COLON] + 2 < baseIndices[PATH] && baseIndices[PATH] == baseIndices[QUERY]) {
+            // Case 1: The base hier-part is just the authority, with an empty path. An additional '/' is
+            // needed after the authority, before appending the reference.
+            uri.append(baseUri, 0, baseIndices[PATH]).append('/').append(referenceUri);
+            return removeDotSegments(uri, baseIndices[PATH], baseIndices[PATH] + refIndices[QUERY] + 1);
+        } else {
+            // Case 2: Otherwise, find the last '/' in the base hier-part and append the reference after
+            // it. If base hier-part has no '/', it could only mean that it is completely empty or
+            // contains only one segment, in which case the whole hier-part is excluded and the reference
+            // is appended right after the base scheme colon without an added '/'.
+            int lastSlashIndex = baseUri.lastIndexOf('/', baseIndices[QUERY] - 1);
+            int baseLimit = lastSlashIndex == -1 ? baseIndices[PATH] : lastSlashIndex + 1;
+            uri.append(baseUri, 0, baseLimit).append(referenceUri);
+            return removeDotSegments(uri, baseIndices[PATH], baseLimit + refIndices[QUERY]);
+        }
+    }
+
+    /**
+     * Removes dot segments from the path of a URI.
+     *
+     * <p>The {@code uri} builder is edited in place; characters outside the range given by
+     * {@code offset} and {@code limit} are not inspected for dot segments.
+     *
+     * @param uri    A {@link StringBuilder} containing the URI.
+     * @param offset The index of the start of the path in {@code uri}.
+     * @param limit  The limit (exclusive) of the path in {@code uri}.
+     * @return The contents of {@code uri} after the removal. When {@code offset >= limit} the
+     *     contents are returned unchanged.
+     */
+    private static String removeDotSegments(StringBuilder uri, int offset, int limit) {
+        if (offset >= limit) {
+            // Nothing to do.
+            return uri.toString();
+        }
+        if (uri.charAt(offset) == '/') {
+            // If the path starts with a /, always retain it.
+            offset++;
+        }
+        // The first character of the current path segment.
+        int segmentStart = offset;
+        int i = offset;
+        while (i <= limit) {
+            int nextSegmentStart;
+            if (i == limit) {
+                nextSegmentStart = i;
+            } else if (uri.charAt(i) == '/') {
+                nextSegmentStart = i + 1;
+            } else {
+                i++;
+                continue;
+            }
+            // We've encountered the end of a segment or the end of the path. If the final segment was
+            // "." or "..", remove the appropriate segments of the path.
+            if (i == segmentStart + 1 && uri.charAt(segmentStart) == '.') {
+                // Given "abc/def/./ghi", remove "./" to get "abc/def/ghi".
+                uri.delete(segmentStart, nextSegmentStart);
+                limit -= nextSegmentStart - segmentStart;
+                i = segmentStart;
+            } else if (i == segmentStart + 2 && uri.charAt(segmentStart) == '.' && uri.charAt(segmentStart + 1) == '.') {
+                // Given "abc/def/../ghi", remove "def/../" to get "abc/ghi".
+                int prevSegmentStart = uri.lastIndexOf("/", segmentStart - 2) + 1;
+                int removeFrom = Math.max(prevSegmentStart, offset);
+                uri.delete(removeFrom, nextSegmentStart);
+                limit -= nextSegmentStart - removeFrom;
+                segmentStart = prevSegmentStart;
+                i = prevSegmentStart;
+            } else {
+                i++;
+                segmentStart = i;
+            }
+        }
+        return uri.toString();
+    }
+
+    /**
+     * Calculates indices of the constituent components of a URI.
+     *
+     * <p>For an empty or {@code null} string every entry is 0 except the {@link #SCHEME_COLON}
+     * entry, which is -1.
+     *
+     * @param uriString The URI as a string.
+     * @return The corresponding indices, as an array of length {@link #INDEX_COUNT} addressed by
+     *     {@link #SCHEME_COLON}, {@link #PATH}, {@link #QUERY} and {@link #FRAGMENT}.
+     */
+    private static int[] getUriIndices(String uriString) {
+        int[] indices = new int[INDEX_COUNT];
+        if (TextUtils.isEmpty(uriString)) {
+            indices[SCHEME_COLON] = -1;
+            return indices;
+        }
+
+        // Determine outer structure from right to left.
+        // Uri = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+        int length = uriString.length();
+        int fragmentIndex = uriString.indexOf('#');
+        if (fragmentIndex == -1) {
+            fragmentIndex = length;
+        }
+        int queryIndex = uriString.indexOf('?');
+        if (queryIndex == -1 || queryIndex > fragmentIndex) {
+            // '#' before '?': '?' is within the fragment.
+            queryIndex = fragmentIndex;
+        }
+        // Slashes are allowed only in hier-part so any colon after the first slash is part of the
+        // hier-part, not the scheme colon separator.
+        int schemeIndexLimit = uriString.indexOf('/');
+        if (schemeIndexLimit == -1 || schemeIndexLimit > queryIndex) {
+            schemeIndexLimit = queryIndex;
+        }
+        int schemeIndex = uriString.indexOf(':');
+        if (schemeIndex > schemeIndexLimit) {
+            // '/' before ':'
+            schemeIndex = -1;
+        }
+
+        // Determine hier-part structure: hier-part = "//" authority path / path
+        // This block can also cope with schemeIndex == -1.
+        boolean hasAuthority = schemeIndex + 2 < queryIndex && uriString.charAt(schemeIndex + 1) == '/' && uriString.charAt(schemeIndex + 2) == '/';
+        int pathIndex;
+        if (hasAuthority) {
+            pathIndex = uriString.indexOf('/', schemeIndex + 3); // find first '/' after "://"
+            if (pathIndex == -1 || pathIndex > queryIndex) {
+                pathIndex = queryIndex;
+            }
+        } else {
+            pathIndex = schemeIndex + 1;
+        }
+
+        indices[SCHEME_COLON] = schemeIndex;
+        indices[PATH] = pathIndex;
+        indices[QUERY] = queryIndex;
+        indices[FRAGMENT] = fragmentIndex;
+        return indices;
+    }
+}
--- a/jar/custom_spider.jar
+++ b/jar/custom_spider.jar
--- a/jar/custom_spider.jar.md5
+++ b/jar/custom_spider.jar.md5
@ -1 +1 @@
-6fc8fb3791e3d877fa06ed91c23a77f4
+41d830d74afd5a31464b2f6678cd3f2e
--- a/jar/genJar.bat
+++ b/jar/genJar.bat
@ -6,12 +6,14 @@ rd /s/q "%~dp0\Smali_classes"
 java -jar "%~dp0\3rd\apktool_2.11.0.jar" d -f --only-main-classes "%~dp0\..\app\build\outputs\apk\release\app-release-unsigned.apk" -o "%~dp0\Smali_classes"

 rd /s/q "%~dp0\spider.jar\smali\com\github\catvod\spider"
+rd /s/q "%~dp0\spider.jar\smali\com\github\catvod\js"
 rd /s/q "%~dp0\spider.jar\smali\org\slf4j\"

 if not exist "%~dp0\spider.jar\smali\com\github\catvod\" md "%~dp0\spider.jar\smali\com\github\catvod\"
 if not exist "%~dp0\spider.jar\smali\org\slf4j\" md "%~dp0\spider.jar\smali\org\slf4j\"

 move "%~dp0\Smali_classes\smali\com\github\catvod\spider" "%~dp0\spider.jar\smali\com\github\catvod\"
+move "%~dp0\Smali_classes\smali\com\github\catvod\js" "%~dp0\spider.jar\smali\com\github\catvod\"
 move "%~dp0\Smali_classes\smali\org\slf4j" "%~dp0\spider.jar\smali\org\slf4j\"

 java -jar "%~dp0\3rd\apktool_2.11.0.jar" b "%~dp0\spider.jar" -c