Update hiker rule parser

2 years ago · 9c3db0c3d5
parent 1bb8e21b91
commit 9c3db0c3d5
3 changed files with 71 additions and 73 deletions
--- a/quickjs/src/main/java/com/fongmi/quickjs/bean/Info.java
+++ b/quickjs/src/main/java/com/fongmi/quickjs/bean/Info.java
@@ -29,8 +29,9 @@ public class Info {
            pos = rules[0];
        }
        try {
-            index = Integer.parseInt(pos.split("\\(")[1].split("\\)")[0]);
+            index = Integer.parseInt(pos.replace("eq(", "").replace(")", ""));
        } catch (Exception ignored) {
+            index = 0;
        }
    }

--- a/quickjs/src/main/java/com/fongmi/quickjs/method/Global.java
+++ b/quickjs/src/main/java/com/fongmi/quickjs/method/Global.java
@@ -2,6 +2,7 @@ package com.fongmi.quickjs.method;

 import androidx.annotation.Keep;
 import androidx.annotation.NonNull;
+import androidx.media3.common.util.UriUtil;

 import com.fongmi.quickjs.bean.Req;
 import com.fongmi.quickjs.utils.Connect;
@@ -121,31 +122,31 @@ public class Global {
    @Keep
    @JSMethod
    public String pd(String html, String rule, String urlKey) {
-        return parser.pdfh(html, rule, urlKey);
+        return parser.parseDomForUrl(html, rule, urlKey);
    }

    @Keep
    @JSMethod
    public String pdfh(String html, String rule) {
-        return parser.pdfh(html, rule, "");
+        return parser.parseDomForUrl(html, rule, "");
    }

    @Keep
    @JSMethod
    public JSArray pdfa(String html, String rule) {
-        return JSUtil.toArray(ctx, parser.pdfa(html, rule));
+        return JSUtil.toArray(ctx, parser.parseDomForArray(html, rule));
    }

    @Keep
    @JSMethod
    public JSArray pdfl(String html, String rule, String texts, String urls, String urlKey) {
-        return JSUtil.toArray(ctx, parser.pdfl(html, rule, texts, urls, urlKey));
+        return JSUtil.toArray(ctx, parser.parseDomForList(html, rule, texts, urls, urlKey));
    }

    @Keep
    @JSMethod
    public String joinUrl(String parent, String child) {
-        return parser.joinUrl(parent, child);
+        return UriUtil.resolve(parent, child);
    }

    @Keep
--- a/quickjs/src/main/java/com/fongmi/quickjs/utils/Parser.java
+++ b/quickjs/src/main/java/com/fongmi/quickjs/utils/Parser.java
@@ -20,7 +20,7 @@ import java.util.regex.Pattern;

 public class Parser {

-    private final Pattern p1 = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);
+    private final Pattern URL = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);
    private final Pattern NO_ADD = Pattern.compile(":eq|:lt|:gt|:first|:last|:not|:even|:odd|:has|:contains|:matches|:empty|^body$|^#");
    private final Pattern JOIN_URL = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$|^(data-|url-|src-)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
    private final Pattern SPEC_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
@@ -47,7 +47,9 @@ public class Parser {
    private String parseHikerToJq(String parse, boolean first) {
        if (!parse.contains("&&")) {
            String[] split = parse.split(" ");
-            return (NO_ADD.matcher(split[split.length - 1]).find() || !first) ? parse : parse + ":eq(0)";
+            Matcher m = NO_ADD.matcher(split[split.length - 1]);
+            if (!m.find() && first) parse = parse + ":eq(0)";
+            return parse;
        }
        String[] parses = parse.split("&&");
        List<String> items = new ArrayList<>();
@@ -63,59 +65,7 @@ public class Parser {
        return TextUtils.join(" ", items);
    }

-    private Elements parseOneRule(Document doc, String parse, Elements elements) {
-        Info info = getParseInfo(parse);
-        if (parse.contains(":eq")) {
-            if (elements.isEmpty()) {
-                if (info.index < 0) {
-                    Elements r = doc.select(info.rule);
-                    elements = r.eq(r.size() + info.index);
-                } else {
-                    elements = doc.select(info.rule).eq(info.index);
-                }
-            } else {
-                if (info.index < 0) {
-                    Elements r = elements.select(info.rule);
-                    elements = r.eq(r.size() + info.index);
-                } else {
-                    elements = elements.select(info.rule).eq(info.index);
-                }
-            }
-        } else {
-            if (elements.isEmpty()) {
-                elements = doc.select(parse);
-            } else {
-                elements = elements.select(parse);
-            }
-        }
-        if (info.excludes != null && !elements.isEmpty()) {
-            elements = elements.clone();
-            for (String exclude : info.excludes) {
-                elements.select(exclude).remove();
-            }
-        }
-        return elements;
-    }
-
-    public String joinUrl(String parent, String child) {
-        return UriUtil.resolve(parent, child);
-    }
-
-    public List<String> pdfa(String html, String rule) {
-        Document doc = cache.getPdfa(html);
-        rule = parseHikerToJq(rule, false);
-        String[] parses = rule.split(" ");
-        Elements elements = new Elements();
-        for (String parse : parses) {
-            elements = parseOneRule(doc, parse, elements);
-            if (elements.isEmpty()) return Collections.emptyList();
-        }
-        List<String> items = new ArrayList<>();
-        for (Element element : elements) items.add(element.outerHtml());
-        return items;
-    }
-
-    public String pdfh(String html, String rule, String addUrl) {
+    public String parseDomForUrl(String html, String rule, String addUrl) {
        Document doc = cache.getPdfh(html);
        if ("body&&Text".equals(rule) || "Text".equals(rule)) {
            return doc.text();
@@ -143,23 +93,69 @@ public class Parser {
        } else if ("Html".equals(option)) {
            return elements.html();
        } else {
-            String result = elements.attr(option);
-            if (option.toLowerCase().contains("style") && result.contains("url(")) {
-                Matcher matcher = p1.matcher(result);
-                if (matcher.find()) result = matcher.group(1);
-                if (result != null) result = result.replaceAll("^['|\"](.*)['|\"]$", "$1");
-            }
-            if (!TextUtils.isEmpty(result) && !TextUtils.isEmpty(addUrl)) {
-                if (JOIN_URL.matcher(option).find() && !SPEC_URL.matcher(result).find()) {
-                    if (result.contains("http")) result = result.substring(result.indexOf("http"));
-                    else result = joinUrl(addUrl, result);
+            String result = "";
+            for (String s : option.split("[||]")) {
+                result = elements.attr(s);
+                if (s.toLowerCase().contains("style") && result.contains("url(")) {
+                    Matcher m = URL.matcher(result);
+                    if (m.find()) result = m.group(1);
+                    result = result.replaceAll("^['|\"](.*)['|\"]$", "$1");
+                }
+                if (!result.isEmpty() && !addUrl.isEmpty()) {
+                    if (JOIN_URL.matcher(s).find() && !SPEC_URL.matcher(result).find()) {
+                        if (result.contains("http")) {
+                            result = result.substring(result.indexOf("http"));
+                        } else {
+                            result = UriUtil.resolve(addUrl, result);
+                        }
+                    }
+                }
+                if (!result.isEmpty()) {
+                    return result;
                }
            }
            return result;
        }
    }

-    public List<String> pdfl(String html, String rule, String texts, String urls, String urlKey) {
+    public List<String> parseDomForArray(String html, String rule) {
+        Document doc = cache.getPdfa(html);
+        rule = parseHikerToJq(rule, false);
+        String[] parses = rule.split(" ");
+        Elements elements = new Elements();
+        for (String parse : parses) {
+            elements = parseOneRule(doc, parse, elements);
+            if (elements.isEmpty()) return new ArrayList<>();
+        }
+        List<String> items = new ArrayList<>();
+        for (Element element : elements) items.add(element.outerHtml());
+        return items;
+    }
+
+    private Elements parseOneRule(Document doc, String parse, Elements elements) {
+        Info info = getParseInfo(parse);
+        if (elements.isEmpty()) {
+            elements = doc.select(info.rule);
+        } else {
+            elements = elements.select(info.rule);
+        }
+        if (parse.contains(":eq")) {
+            if (info.index < 0) {
+                elements = elements.eq(elements.size() + info.index);
+            } else {
+                elements = elements.eq(info.index);
+            }
+        }
+        if (info.excludes != null && !elements.isEmpty()) {
+            elements = elements.clone();
+            for (int i = 0; i < info.excludes.size(); i++) {
+                elements.select(info.excludes.get(i)).remove();
+            }
+        }
+        return elements;
+    }
+
+    public List<String> parseDomForList(String html, String rule, String texts, String urls, String urlKey) {
        String[] parses = parseHikerToJq(rule, false).split(" ");
        Elements elements = new Elements();
        for (String parse : parses) {
@@ -169,7 +165,7 @@ public class Parser {
        List<String> items = new ArrayList<>();
        for (Element element : elements) {
            html = element.outerHtml();
-            items.add(pdfh(html, texts, "").trim() + '$' + pdfh(html, urls, urlKey));
+            items.add(parseDomForUrl(html, texts, "").trim() + '$' + parseDomForUrl(html, urls, urlKey));
        }
        return items;
    }