parent
4fb8666f89
commit
6f03a1323d
@ -0,0 +1,54 @@ |
||||
package com.github.catvod.js; |
||||
|
||||
import com.github.catvod.js.utils.Parser; |
||||
import com.github.catvod.js.utils.JSUtil; |
||||
import com.whl.quickjs.wrapper.JSArray; |
||||
import com.whl.quickjs.wrapper.JSMethod; |
||||
import com.whl.quickjs.wrapper.QuickJSContext; |
||||
|
||||
import java.lang.reflect.Method; |
||||
|
||||
public class Function { |
||||
|
||||
private final QuickJSContext ctx; |
||||
private final Parser parser; |
||||
|
||||
public Function(QuickJSContext ctx) { |
||||
this.parser = new Parser(); |
||||
this.ctx = ctx; |
||||
setProperty(); |
||||
} |
||||
|
||||
private void setProperty() { |
||||
for (Method method : getClass().getMethods()) { |
||||
if (!method.isAnnotationPresent(JSMethod.class)) continue; |
||||
ctx.getGlobalObject().setProperty(method.getName(), args -> { |
||||
try { |
||||
return method.invoke(this, args); |
||||
} catch (Exception e) { |
||||
return null; |
||||
} |
||||
}); |
||||
} |
||||
} |
||||
|
||||
@JSMethod |
||||
public String pd(String html, String rule, String urlKey) { |
||||
return parser.parseDomForUrl(html, rule, urlKey); |
||||
} |
||||
|
||||
@JSMethod |
||||
public String pdfh(String html, String rule) { |
||||
return parser.parseDomForUrl(html, rule, ""); |
||||
} |
||||
|
||||
@JSMethod |
||||
public JSArray pdfa(String html, String rule) { |
||||
return JSUtil.toArray(ctx, parser.parseDomForArray(html, rule)); |
||||
} |
||||
|
||||
@JSMethod |
||||
public JSArray pdfl(String html, String rule, String texts, String urls, String urlKey) { |
||||
return JSUtil.toArray(ctx, parser.parseDomForList(html, rule, texts, urls, urlKey)); |
||||
} |
||||
} |
||||
@ -0,0 +1,32 @@ |
||||
package com.github.catvod.js.bean; |
||||
|
||||
import org.jsoup.Jsoup; |
||||
import org.jsoup.nodes.Document; |
||||
|
||||
public class Cache { |
||||
|
||||
private String pdfhHtml; |
||||
private String pdfaHtml; |
||||
private Document pdfhDoc; |
||||
private Document pdfaDoc; |
||||
|
||||
public Document getPdfh(String html) { |
||||
updatePdfh(html); |
||||
return pdfhDoc; |
||||
} |
||||
|
||||
public Document getPdfa(String html) { |
||||
updatePdfa(html); |
||||
return pdfaDoc; |
||||
} |
||||
|
||||
private void updatePdfh(String html) { |
||||
if (html.equals(pdfhHtml)) return; |
||||
pdfhDoc = Jsoup.parse(pdfhHtml = html); |
||||
} |
||||
|
||||
private void updatePdfa(String html) { |
||||
if (html.equals(pdfaHtml)) return; |
||||
pdfaDoc = Jsoup.parse(pdfaHtml = html); |
||||
} |
||||
} |
||||
@ -0,0 +1,42 @@ |
||||
package com.github.catvod.js.bean; |
||||
|
||||
import java.util.ArrayList; |
||||
import java.util.Arrays; |
||||
import java.util.List; |
||||
|
||||
/**
 * Parsed form of a single selector step: the selector text itself, the element index requested
 * through an {@code eq(n)} suffix, and the list of selectors to exclude (written after
 * {@code --}).
 */
public class Info {

    /** Separator between the selector (or position) and each exclude selector. */
    private static final String SEPARATOR = "--";

    /** Index taken from the {@code eq(n)} position; 0 when absent or not a number. */
    public int index;
    /** Selector text with any {@code --} excludes stripped once {@link #setInfo} has run. */
    public String rule;
    /** Exclude selectors, or {@code null} when none were ever supplied. */
    public List<String> excludes;

    public Info(String rule) {
        this.rule = rule;
    }

    public void setRule(String rule) {
        this.rule = rule;
    }

    /**
     * Parses the position part of a step (for example {@code eq(2)} or {@code eq(2)--span}).
     *
     * <p>Excludes may be attached to the current {@link #rule}, to {@code pos}, or to both; they
     * are collected in that order. Handling both sides matters: if the position kept its
     * {@code --} suffix it could not be parsed as a number and the index would silently become 0.
     * {@link #excludes} is left untouched when neither side carries a separator.
     *
     * @param pos the position text; {@code null} or unparsable text yields index 0
     */
    public void setInfo(String pos) {
        List<String> collected = null;
        if (rule != null && rule.contains(SEPARATOR)) {
            String[] parts = rule.split(SEPARATOR);
            collected = tail(parts);
            setRule(head(parts));
        }
        if (pos != null && pos.contains(SEPARATOR)) {
            String[] parts = pos.split(SEPARATOR);
            if (collected == null) collected = tail(parts);
            else collected.addAll(tail(parts));
            pos = head(parts);
        }
        if (collected != null) excludes = collected;
        index = parseIndex(pos);
    }

    /**
     * Stores every element of {@code rules} except the first as the exclude list. An empty or
     * {@code null} array produces an empty list.
     */
    public void setExcludes(String[] rules) {
        excludes = tail(rules);
    }

    /** Returns the first element, or the empty string when the split produced nothing. */
    private static String head(String[] parts) {
        return parts.length > 0 ? parts[0] : "";
    }

    /** Returns a new mutable list holding every element after the first. */
    private static List<String> tail(String[] parts) {
        List<String> rest = new ArrayList<>();
        if (parts == null) return rest;
        for (int i = 1; i < parts.length; i++) rest.add(parts[i]);
        return rest;
    }

    /** Extracts the integer from {@code eq(n)}; anything that is not a number maps to 0. */
    private static int parseIndex(String pos) {
        if (pos == null) return 0;
        try {
            return Integer.parseInt(pos.replace("eq(", "").replace(")", ""));
        } catch (NumberFormatException ignored) {
            // A position such as "first" is not numeric; fall back to the first element.
            return 0;
        }
    }
}
||||
@ -0,0 +1,16 @@ |
||||
package com.github.catvod.js.utils; |
||||
|
||||
import com.whl.quickjs.wrapper.JSArray; |
||||
import com.whl.quickjs.wrapper.QuickJSContext; |
||||
|
||||
import java.util.List; |
||||
|
||||
public class JSUtil { |
||||
|
||||
public static JSArray toArray(QuickJSContext ctx, List<String> items) { |
||||
JSArray array = ctx.createNewJSArray(); |
||||
if (items == null || items.isEmpty()) return array; |
||||
for (int i = 0; i < items.size(); i++) array.set(items.get(i), i); |
||||
return array; |
||||
} |
||||
} |
||||
@ -0,0 +1,171 @@ |
||||
package com.github.catvod.js.utils; |
||||
|
||||
import android.text.TextUtils; |
||||
|
||||
import com.github.catvod.js.bean.Cache; |
||||
import com.github.catvod.js.bean.Info; |
||||
import com.github.catvod.utils.UriUtil; |
||||
|
||||
import org.jsoup.nodes.Document; |
||||
import org.jsoup.nodes.Element; |
||||
import org.jsoup.select.Elements; |
||||
|
||||
import java.util.ArrayList; |
||||
import java.util.Arrays; |
||||
import java.util.Collections; |
||||
import java.util.List; |
||||
import java.util.regex.Matcher; |
||||
import java.util.regex.Pattern; |
||||
|
||||
public class Parser { |
||||
|
||||
private final Pattern URL = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL); |
||||
private final Pattern NO_ADD = Pattern.compile(":eq|:lt|:gt|:first|:last|:not|:even|:odd|:has|:contains|:matches|:empty|^body$|^#"); |
||||
private final Pattern JOIN_URL = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$|^(data-|url-|src-)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); |
||||
private final Pattern SPEC_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); |
||||
|
||||
private final Cache cache; |
||||
|
||||
public Parser() { |
||||
cache = new Cache(); |
||||
} |
||||
|
||||
private Info getParseInfo(String rule) { |
||||
Info info = new Info(rule); |
||||
if (rule.contains(":eq")) { |
||||
info.setRule(rule.split(":")[0]); |
||||
info.setInfo(rule.split(":")[1]); |
||||
} else if (rule.contains("--")) { |
||||
String[] rules = rule.split("--"); |
||||
info.setExcludes(rules); |
||||
info.setRule(rules[0]); |
||||
} |
||||
return info; |
||||
} |
||||
|
||||
private String parseHikerToJq(String parse, boolean first) { |
||||
if (!parse.contains("&&")) { |
||||
String[] split = parse.split(" "); |
||||
Matcher m = NO_ADD.matcher(split[split.length - 1]); |
||||
if (!m.find() && first) parse = parse + ":eq(0)"; |
||||
return parse; |
||||
} |
||||
String[] parses = parse.split("&&"); |
||||
List<String> items = new ArrayList<>(); |
||||
for (int i = 0; i < parses.length; i++) { |
||||
String[] split = parses[i].split(" "); |
||||
if (NO_ADD.matcher(split[split.length - 1]).find()) { |
||||
items.add(parses[i]); |
||||
} else { |
||||
if (!first && i >= parses.length - 1) items.add(parses[i]); |
||||
else items.add(parses[i] + ":eq(0)"); |
||||
} |
||||
} |
||||
return TextUtils.join(" ", items); |
||||
} |
||||
|
||||
public String parseDomForUrl(String html, String rule, String addUrl) { |
||||
Document doc = cache.getPdfh(html); |
||||
if ("body&&Text".equals(rule) || "Text".equals(rule)) { |
||||
return doc.text(); |
||||
} else if ("body&&Html".equals(rule) || "Html".equals(rule)) { |
||||
return doc.html(); |
||||
} |
||||
String option = ""; |
||||
if (rule.contains("&&")) { |
||||
String[] rs = rule.split("&&"); |
||||
option = rs[rs.length - 1]; |
||||
List<String> excludes = new ArrayList<>(Arrays.asList(rs)); |
||||
excludes.remove(rs.length - 1); |
||||
rule = TextUtils.join("&&", excludes); |
||||
} |
||||
rule = parseHikerToJq(rule, true); |
||||
String[] parses = rule.split(" "); |
||||
Elements elements = new Elements(); |
||||
for (String parse : parses) { |
||||
elements = parseOneRule(doc, parse, elements); |
||||
if (elements.isEmpty()) return ""; |
||||
} |
||||
if (TextUtils.isEmpty(option)) return elements.outerHtml(); |
||||
if ("Text".equals(option)) { |
||||
return elements.text(); |
||||
} else if ("Html".equals(option)) { |
||||
return elements.html(); |
||||
} else { |
||||
String result = ""; |
||||
for (String s : option.split("[||]")) { |
||||
result = elements.attr(s); |
||||
if (s.toLowerCase().contains("style") && result.contains("url(")) { |
||||
Matcher m = URL.matcher(result); |
||||
if (m.find()) result = m.group(1); |
||||
result = result.replaceAll("^['|\"](.*)['|\"]$", "$1"); |
||||
} |
||||
if (!result.isEmpty() && !addUrl.isEmpty()) { |
||||
if (JOIN_URL.matcher(s).find() && !SPEC_URL.matcher(result).find()) { |
||||
if (result.contains("http")) { |
||||
result = result.substring(result.indexOf("http")); |
||||
} else { |
||||
result = UriUtil.resolve(addUrl, result); |
||||
} |
||||
} |
||||
} |
||||
if (!result.isEmpty()) { |
||||
return result; |
||||
} |
||||
} |
||||
return result; |
||||
} |
||||
} |
||||
|
||||
public List<String> parseDomForArray(String html, String rule) { |
||||
Document doc = cache.getPdfa(html); |
||||
rule = parseHikerToJq(rule, false); |
||||
String[] parses = rule.split(" "); |
||||
Elements elements = new Elements(); |
||||
for (String parse : parses) { |
||||
elements = parseOneRule(doc, parse, elements); |
||||
if (elements.isEmpty()) return new ArrayList<>(); |
||||
} |
||||
List<String> items = new ArrayList<>(); |
||||
for (Element element : elements) items.add(element.outerHtml()); |
||||
return items; |
||||
} |
||||
|
||||
private Elements parseOneRule(Document doc, String parse, Elements elements) { |
||||
Info info = getParseInfo(parse); |
||||
if (elements.isEmpty()) { |
||||
elements = doc.select(info.rule); |
||||
} else { |
||||
elements = elements.select(info.rule); |
||||
} |
||||
if (parse.contains(":eq")) { |
||||
if (info.index < 0) { |
||||
elements = elements.eq(elements.size() + info.index); |
||||
} else { |
||||
elements = elements.eq(info.index); |
||||
} |
||||
} |
||||
if (info.excludes != null && !elements.isEmpty()) { |
||||
elements = elements.clone(); |
||||
for (int i = 0; i < info.excludes.size(); i++) { |
||||
elements.select(info.excludes.get(i)).remove(); |
||||
} |
||||
} |
||||
return elements; |
||||
} |
||||
|
||||
public List<String> parseDomForList(String html, String rule, String texts, String urls, String urlKey) { |
||||
String[] parses = parseHikerToJq(rule, false).split(" "); |
||||
Elements elements = new Elements(); |
||||
for (String parse : parses) { |
||||
elements = parseOneRule(cache.getPdfa(html), parse, elements); |
||||
if (elements.isEmpty()) return Collections.emptyList(); |
||||
} |
||||
List<String> items = new ArrayList<>(); |
||||
for (Element element : elements) { |
||||
html = element.outerHtml(); |
||||
items.add(parseDomForUrl(html, texts, "").trim() + '$' + parseDomForUrl(html, urls, urlKey)); |
||||
} |
||||
return items; |
||||
} |
||||
} |
||||
@ -0,0 +1,231 @@ |
||||
package com.github.catvod.utils; |
||||
|
||||
import android.text.TextUtils; |
||||
|
||||
import androidx.annotation.Nullable; |
||||
|
||||
/** |
||||
* Utility methods for manipulating URIs. |
||||
*/ |
||||
public final class UriUtil { |
||||
|
||||
/** |
||||
* The length of arrays returned by {@link #getUriIndices(String)}. |
||||
*/ |
||||
private static final int INDEX_COUNT = 4; |
||||
|
||||
/** |
||||
* An index into an array returned by {@link #getUriIndices(String)}. |
||||
* |
||||
* <p>The value at this position in the array is the index of the ':' after the scheme. Equals -1 |
||||
* if the URI is a relative reference (no scheme). The hier-part starts at (schemeColon + 1), |
||||
* including when the URI has no scheme. |
||||
*/ |
||||
private static final int SCHEME_COLON = 0; |
||||
|
||||
/** |
||||
* An index into an array returned by {@link #getUriIndices(String)}. |
||||
* |
||||
* <p>The value at this position in the array is the index of the path part. Equals (schemeColon + |
||||
* 1) if no authority part, (schemeColon + 3) if the authority part consists of just "//", and |
||||
* (query) if no path part. The characters starting at this index can be "//" only if the |
||||
* authority part is non-empty (in this case the double-slash means the first segment is empty). |
||||
*/ |
||||
private static final int PATH = 1; |
||||
|
||||
/** |
||||
* An index into an array returned by {@link #getUriIndices(String)}. |
||||
* |
||||
* <p>The value at this position in the array is the index of the query part, including the '?' |
||||
* before the query. Equals fragment if no query part, and (fragment - 1) if the query part is a |
||||
* single '?' with no data. |
||||
*/ |
||||
private static final int QUERY = 2; |
||||
|
||||
/** |
||||
* An index into an array returned by {@link #getUriIndices(String)}. |
||||
* |
||||
* <p>The value at this position in the array is the index of the fragment part, including the '#' |
||||
* before the fragment. Equal to the length of the URI if no fragment part, and (length - 1) if |
||||
* the fragment part is a single '#' with no data. |
||||
*/ |
||||
private static final int FRAGMENT = 3; |
||||
|
||||
/** |
||||
* Performs relative resolution of a {@code referenceUri} with respect to a {@code baseUri}. |
||||
* |
||||
* <p>The resolution is performed as specified by RFC-3986. |
||||
* |
||||
* @param baseUri The base URI. |
||||
* @param referenceUri The reference URI to resolve. |
||||
*/ |
||||
public static String resolve(@Nullable String baseUri, @Nullable String referenceUri) { |
||||
StringBuilder uri = new StringBuilder(); |
||||
|
||||
// Map null onto empty string, to make the following logic simpler.
|
||||
baseUri = baseUri == null ? "" : baseUri; |
||||
referenceUri = referenceUri == null ? "" : referenceUri; |
||||
|
||||
int[] refIndices = getUriIndices(referenceUri); |
||||
if (refIndices[SCHEME_COLON] != -1) { |
||||
// The reference is absolute. The target Uri is the reference.
|
||||
uri.append(referenceUri); |
||||
removeDotSegments(uri, refIndices[PATH], refIndices[QUERY]); |
||||
return uri.toString(); |
||||
} |
||||
|
||||
int[] baseIndices = getUriIndices(baseUri); |
||||
if (refIndices[FRAGMENT] == 0) { |
||||
// The reference is empty or contains just the fragment part, then the target Uri is the
|
||||
// concatenation of the base Uri without its fragment, and the reference.
|
||||
return uri.append(baseUri, 0, baseIndices[FRAGMENT]).append(referenceUri).toString(); |
||||
} |
||||
|
||||
if (refIndices[QUERY] == 0) { |
||||
// The reference starts with the query part. The target is the base up to (but excluding) the
|
||||
// query, plus the reference.
|
||||
return uri.append(baseUri, 0, baseIndices[QUERY]).append(referenceUri).toString(); |
||||
} |
||||
|
||||
if (refIndices[PATH] != 0) { |
||||
// The reference has authority. The target is the base scheme plus the reference.
|
||||
int baseLimit = baseIndices[SCHEME_COLON] + 1; |
||||
uri.append(baseUri, 0, baseLimit).append(referenceUri); |
||||
return removeDotSegments(uri, baseLimit + refIndices[PATH], baseLimit + refIndices[QUERY]); |
||||
} |
||||
|
||||
if (referenceUri.charAt(refIndices[PATH]) == '/') { |
||||
// The reference path is rooted. The target is the base scheme and authority (if any), plus
|
||||
// the reference.
|
||||
uri.append(baseUri, 0, baseIndices[PATH]).append(referenceUri); |
||||
return removeDotSegments(uri, baseIndices[PATH], baseIndices[PATH] + refIndices[QUERY]); |
||||
} |
||||
|
||||
// The target Uri is the concatenation of the base Uri up to (but excluding) the last segment,
|
||||
// and the reference. This can be split into 2 cases:
|
||||
if (baseIndices[SCHEME_COLON] + 2 < baseIndices[PATH] && baseIndices[PATH] == baseIndices[QUERY]) { |
||||
// Case 1: The base hier-part is just the authority, with an empty path. An additional '/' is
|
||||
// needed after the authority, before appending the reference.
|
||||
uri.append(baseUri, 0, baseIndices[PATH]).append('/').append(referenceUri); |
||||
return removeDotSegments(uri, baseIndices[PATH], baseIndices[PATH] + refIndices[QUERY] + 1); |
||||
} else { |
||||
// Case 2: Otherwise, find the last '/' in the base hier-part and append the reference after
|
||||
// it. If base hier-part has no '/', it could only mean that it is completely empty or
|
||||
// contains only one segment, in which case the whole hier-part is excluded and the reference
|
||||
// is appended right after the base scheme colon without an added '/'.
|
||||
int lastSlashIndex = baseUri.lastIndexOf('/', baseIndices[QUERY] - 1); |
||||
int baseLimit = lastSlashIndex == -1 ? baseIndices[PATH] : lastSlashIndex + 1; |
||||
uri.append(baseUri, 0, baseLimit).append(referenceUri); |
||||
return removeDotSegments(uri, baseIndices[PATH], baseLimit + refIndices[QUERY]); |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Removes dot segments from the path of a URI. |
||||
* |
||||
* @param uri A {@link StringBuilder} containing the URI. |
||||
* @param offset The index of the start of the path in {@code uri}. |
||||
* @param limit The limit (exclusive) of the path in {@code uri}. |
||||
*/ |
||||
private static String removeDotSegments(StringBuilder uri, int offset, int limit) { |
||||
if (offset >= limit) { |
||||
// Nothing to do.
|
||||
return uri.toString(); |
||||
} |
||||
if (uri.charAt(offset) == '/') { |
||||
// If the path starts with a /, always retain it.
|
||||
offset++; |
||||
} |
||||
// The first character of the current path segment.
|
||||
int segmentStart = offset; |
||||
int i = offset; |
||||
while (i <= limit) { |
||||
int nextSegmentStart; |
||||
if (i == limit) { |
||||
nextSegmentStart = i; |
||||
} else if (uri.charAt(i) == '/') { |
||||
nextSegmentStart = i + 1; |
||||
} else { |
||||
i++; |
||||
continue; |
||||
} |
||||
// We've encountered the end of a segment or the end of the path. If the final segment was
|
||||
// "." or "..", remove the appropriate segments of the path.
|
||||
if (i == segmentStart + 1 && uri.charAt(segmentStart) == '.') { |
||||
// Given "abc/def/./ghi", remove "./" to get "abc/def/ghi".
|
||||
uri.delete(segmentStart, nextSegmentStart); |
||||
limit -= nextSegmentStart - segmentStart; |
||||
i = segmentStart; |
||||
} else if (i == segmentStart + 2 && uri.charAt(segmentStart) == '.' && uri.charAt(segmentStart + 1) == '.') { |
||||
// Given "abc/def/../ghi", remove "def/../" to get "abc/ghi".
|
||||
int prevSegmentStart = uri.lastIndexOf("/", segmentStart - 2) + 1; |
||||
int removeFrom = Math.max(prevSegmentStart, offset); |
||||
uri.delete(removeFrom, nextSegmentStart); |
||||
limit -= nextSegmentStart - removeFrom; |
||||
segmentStart = prevSegmentStart; |
||||
i = prevSegmentStart; |
||||
} else { |
||||
i++; |
||||
segmentStart = i; |
||||
} |
||||
} |
||||
return uri.toString(); |
||||
} |
||||
|
||||
/** |
||||
* Calculates indices of the constituent components of a URI. |
||||
* |
||||
* @param uriString The URI as a string. |
||||
* @return The corresponding indices. |
||||
*/ |
||||
private static int[] getUriIndices(String uriString) { |
||||
int[] indices = new int[INDEX_COUNT]; |
||||
if (TextUtils.isEmpty(uriString)) { |
||||
indices[SCHEME_COLON] = -1; |
||||
return indices; |
||||
} |
||||
|
||||
// Determine outer structure from right to left.
|
||||
// Uri = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
|
||||
int length = uriString.length(); |
||||
int fragmentIndex = uriString.indexOf('#'); |
||||
if (fragmentIndex == -1) { |
||||
fragmentIndex = length; |
||||
} |
||||
int queryIndex = uriString.indexOf('?'); |
||||
if (queryIndex == -1 || queryIndex > fragmentIndex) { |
||||
// '#' before '?': '?' is within the fragment.
|
||||
queryIndex = fragmentIndex; |
||||
} |
||||
// Slashes are allowed only in hier-part so any colon after the first slash is part of the
|
||||
// hier-part, not the scheme colon separator.
|
||||
int schemeIndexLimit = uriString.indexOf('/'); |
||||
if (schemeIndexLimit == -1 || schemeIndexLimit > queryIndex) { |
||||
schemeIndexLimit = queryIndex; |
||||
} |
||||
int schemeIndex = uriString.indexOf(':'); |
||||
if (schemeIndex > schemeIndexLimit) { |
||||
// '/' before ':'
|
||||
schemeIndex = -1; |
||||
} |
||||
|
||||
// Determine hier-part structure: hier-part = "//" authority path / path
|
||||
// This block can also cope with schemeIndex == -1.
|
||||
boolean hasAuthority = schemeIndex + 2 < queryIndex && uriString.charAt(schemeIndex + 1) == '/' && uriString.charAt(schemeIndex + 2) == '/'; |
||||
int pathIndex; |
||||
if (hasAuthority) { |
||||
pathIndex = uriString.indexOf('/', schemeIndex + 3); // find first '/' after "://"
|
||||
if (pathIndex == -1 || pathIndex > queryIndex) { |
||||
pathIndex = queryIndex; |
||||
} |
||||
} else { |
||||
pathIndex = schemeIndex + 1; |
||||
} |
||||
|
||||
indices[SCHEME_COLON] = schemeIndex; |
||||
indices[PATH] = pathIndex; |
||||
indices[QUERY] = queryIndex; |
||||
indices[FRAGMENT] = fragmentIndex; |
||||
return indices; |
||||
} |
||||
} |
||||
Binary file not shown.
@ -1 +1 @@ |
||||
6fc8fb3791e3d877fa06ed91c23a77f4 |
||||
41d830d74afd5a31464b2f6678cd3f2e |
||||
|
||||
Loading…
Reference in new issue