From c42e4118eaa6b6a63a759426966521a380a50681 Mon Sep 17 00:00:00 2001 From: Kornel Date: Mon, 13 Jan 2025 20:14:30 +0100 Subject: [PATCH] Preserve case and ; when replacing url(;base64) Fixes #10 --- src/lib.rs | 112 +++++++++++++++++++-------------------------- tests/filtered.xml | 11 +++-- tests/test.xml | 1 + 3 files changed, 54 insertions(+), 70 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c19a76e..f1c6c83 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -520,67 +520,47 @@ impl Filter { /// and enabling filter bypass (e.g. if our tokenizer saw a comment but browsers didn't). /// Proper implementation needs browser-grade parsers, and that's a big ask for a small tool. /// - /// So here's a brute approach instead: throwing out everything that isn't obviously trivial. - /// We don't really need to support SVG files that rely on weird tricky syntax. We can mangle all - /// the edge cases as long as we're erring on the side of safety. + /// So here's a brute approach instead: replacing everything that could possibly be a `url()`, + /// and throwing away everything that looks tricky to parse. fn filtered_url_func(&self, css_or_svg: &str) -> String { - // If we see something suspicious we'll try to remove it instead of throwing away the whole stylesheet/attribute. - css_or_svg.split_inclusive([';', '{', '}', ',']) - .filter_map(|full_chunk| { - // CSS allows escapes in identifiers! No escape schenanigans allowed, as this makes matching require a proper tokenizer. - if full_chunk.contains('\\') { - // TODO: could perform unescaping instead - return None; - } - - // without escapes it's reasonable to search for identifiers now (lowercase to make them case-insensitive) - let chunk_lower = full_chunk.to_ascii_lowercase(); - // @rules don't allow extra whitespace, so apart from escapes, they can't be obfuscated - if chunk_lower.contains("@import") { - return None; - } + let mut out = String::with_capacity(css_or_svg.len()); + + // find next potential start of a URL (case-insensitive) + let mut inside_url = false; + for mut chunk in css_or_svg.split_inclusive('(') { + if inside_url { + // unescaped ( in URLs is not allowed, keep skipping until url( closes + let Some((url, rest)) = chunk.split_once(')') else { + continue; + }; + chunk = rest; + + let url = url.trim() + .trim_start_matches(['"', '\'']).trim_end_matches(['"', '\'']) + // the quoted value allows whitespace too! + .trim(); + + let url = self.filter_url(url) + // no tricky chars please. + .filter(|url| !url.contains(['(', ')', '\'', '"', '\\', ' '])); + // the url(#) must stay to keep the syntax valid + out.push_str(&url.as_deref().unwrap_or("#")); + out.push(')'); + } - // `url()` doesn't allow whitespace before '(' - if chunk_lower.contains("url(") { - let mut out = String::with_capacity(chunk_lower.len()); - // this is like split(), but with the match done on lowercased string for case-insensitivity. Indices are used to get original case back. - // this works thanks to to_ascii_lowercase preserving offsets. - let mut last_idx = 0; - let mut url_chunks = chunk_lower.match_indices("url(").chain(Some((full_chunk.len(), ""))) - .map(|(idx, frag)| { - let s = &full_chunk[last_idx..idx]; - last_idx = idx+frag.len(); - s - }); - out.push_str(url_chunks.next().expect("first")); // text before `url(` - while let Some(url_chunk) = url_chunks.next() { - // Too bad if you use unescaped () in your URL - let mut urlfunc_parts = url_chunk.split(')'); - let urlfunc = urlfunc_parts.next().expect("first"); - let after_urlfunc = urlfunc_parts.next()?; // if it's none, it's `url(url(`, which we don't want - let url = urlfunc.trim() - // we've already established there are no escape chars in the chunk, - // so there can't be any tricky things inside the string. - // Sorry to everyone who uses unescaped quote chars at the end of their URLs. - .trim_start_matches(['"', '\'']).trim_end_matches(['"', '\'']) - // the quoted value allows whitespace too! - .trim(); - let url = self.filter_url(url) - // no tricky chars please. For SVG it's important to keep url() even if the URL inside it is bogus - // because otherwise properties like fill would default to black instead of transparent - .filter(|url| !url.contains(['(',')','\'','"','\\'])); - let url = url.as_deref().unwrap_or("#"); - out.push_str("url("); - out.push_str(url); - out.push(')'); - out.push_str(after_urlfunc); // it's important to keep the terminator + for chunk in chunk.split_inclusive([';', '{', '}', ',']) { + if chunk.contains('\\') || (chunk.contains('@') && chunk.to_ascii_lowercase().contains("@import")) { + if let Some(b @ (b'{' | b'}')) = chunk.as_bytes().last().copied() { + out.push(b as char); } - return Some(Cow::Owned(out)); + continue; } - - Some(Cow::Borrowed(full_chunk)) - }) - .collect() + out.push_str(chunk); + // `url()` doesn't allow whitespace before '(' + inside_url = chunk.len() >= 4 && chunk.get(chunk.len() - 4..).is_some_and(|c| c.eq_ignore_ascii_case("url(")); + } + } + out } } @@ -593,28 +573,30 @@ fn urlfunc() { assert_eq!(f.filtered_url_func("hello, url( http://evil.com )"), "hello, url(#)"); assert_eq!(f.filtered_url_func("hello, url( //evil.com )"), "hello, url(#)"); assert_eq!(f.filtered_url_func("url( /okay ), (), bye"), "url(/okay), (), bye"); - assert_eq!(f.filtered_url_func("hello, url( unclosed"), "hello,"); + assert_eq!(f.filtered_url_func("hello, url( unclosed"), "hello, url("); assert_eq!(f.filtered_url_func("hello, url( ) url( ) bork"), "hello, url() url() bork"); assert_eq!(f.filtered_url_func("hello, url('1' )url( 2 ) bork"), "hello, url(1)url(2) bork"); - assert_eq!(f.filtered_url_func("hello, url('(' )"), "hello, url(%28)"); + assert_eq!(f.filtered_url_func("hello, url('(' )"), "hello, url()"); } #[test] fn css() { let f = Filter::new(); - assert_eq!(f.filtered_url_func("color: red; background: URL(X); huh"), "color: red; background: url(X); huh"); + assert_eq!(f.filtered_url_func("ok:url(data:text/plain;base64,AAA)"), "ok:url(#)"); + assert_eq!(f.filtered_url_func("color: red; background: URL(X); huh"), "color: red; background: URL(X); huh"); assert_eq!(f.filtered_url_func("@import 'foo'; FONT-size: 1em;"), " FONT-size: 1em;"); assert_eq!(f.filtered_url_func("u;url(data:x);rl(hack)"), "u;url(#);rl(hack)"); assert_eq!(f.filtered_url_func("u;\\;rl(hack)"), "u;rl(hack)"); assert_eq!(f.filtered_url_func("u;\\,rl(hack)"), "u;rl(hack)"); - assert_eq!(f.filtered_url_func("u;\\url(hack)"), "u;"); + assert_eq!(f.filtered_url_func("u;\\url(hack)"), "u;hack)"); assert_eq!(f.filtered_url_func("font-size: 1em; @Import 'foo';"), "font-size: 1em;"); assert_eq!(f.filtered_url_func("@\\69MporT 'foo';"), ""); - assert_eq!(f.filtered_url_func("color: red; background: URL( url(); huh)"), "color: red; huh)"); - assert_eq!(f.filtered_url_func("color: red; background: URL(//x/rel);"), "color: red; background: url(/rel);"); - assert_eq!(f.filtered_url_func("color: red; background: URL(data:xx);"), "color: red; background: url(#);"); - assert_eq!(f.filtered_url_func("color: red; background: UR\\L(x); border: blue"), "color: red; border: blue"); + assert_eq!(f.filtered_url_func("color: red; background: URL( url(); huh)"), "color: red; background: URL(); huh)"); + assert_eq!(f.filtered_url_func("color: red; background: URL(//x/rel);"), "color: red; background: URL(/rel);"); + assert_eq!(f.filtered_url_func("color: red; background: URL(data:xx);"), "color: red; background: URL(#);"); + assert_eq!(f.filtered_url_func("color: red; background: UR\\L(x); border: blue"), "color: red;x); border: blue"); assert_eq!(f.filtered_url_func("prop: url (it is not);"), "prop: url (it is not);"); + assert_eq!(f.filtered_url_func("sel, sel\\2 { bg: url(data:x); ba\\d; }"), "sel,{ bg: url(#); }"); } #[test] diff --git a/tests/filtered.xml b/tests/filtered.xml index a39f7d0..6b5c5c1 100644 --- a/tests/filtered.xml +++ b/tests/filtered.xml @@ -7,11 +7,11 @@ - - - - - + + + + + @@ -19,6 +19,7 @@ + diff --git a/tests/test.xml b/tests/test.xml index a2e0827..6ab0994 100644 --- a/tests/test.xml +++ b/tests/test.xml @@ -20,6 +20,7 @@ +