From 026cf75690cbbc300b1a0ded70ec534ebb687044 Mon Sep 17 00:00:00 2001 From: Martin Algesten Date: Sun, 14 Mar 2021 16:43:59 +0100 Subject: [PATCH] Handle non-utf8 status and headers Non-utf8 headers are ignored and reading the value for them will yield `None`. --- src/header.rs | 203 +++++++++++++++++++++++++++++++++++++----------- src/response.rs | 87 ++++++++++++--------- src/unit.rs | 4 +- 3 files changed, 212 insertions(+), 82 deletions(-) diff --git a/src/header.rs b/src/header.rs index 122cee2..342f1c1 100644 --- a/src/header.rs +++ b/src/header.rs @@ -1,6 +1,66 @@ use crate::error::{Error, ErrorKind}; use std::fmt; -use std::str::FromStr; +use std::str::{from_utf8, FromStr}; + +/// Since a status line or header can contain non-utf8 characters the +/// backing store is a `Vec<u8>` +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct HeaderLine(Vec); + +impl From for HeaderLine { + fn from(s: String) -> Self { + HeaderLine(s.into_bytes()) + } +} + +impl From> for HeaderLine { + fn from(b: Vec) -> Self { + HeaderLine(b) + } +} + +impl HeaderLine { + pub fn into_string_lossy(self) -> String { + // Try to avoid an extra allocation. + String::from_utf8(self.0) + .unwrap_or_else(|e| String::from_utf8_lossy(&e.into_bytes()).to_string()) + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + fn as_bytes(&self) -> &[u8] { + &self.0 + } + + pub fn into_header(self) -> Result { + // The header name should always be ascii, we can read anything up to the + // ':' delimiter byte-by-byte. 
+ let mut index = 0; + + for c in self.as_bytes() { + if *c == b':' { + break; + } + if !is_tchar(c) { + return Err(Error::new( + ErrorKind::BadHeader, + Some(format!("Invalid char ({:0x?}) while looking for ':'", *c)), + )); + } + index += 1; + } + + Ok(Header { line: self, index }) + } +} + +impl fmt::Display for HeaderLine { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", String::from_utf8_lossy(&self.0)) + } +} #[derive(Clone, PartialEq)] /// Wrapper type for a header field. @@ -8,7 +68,7 @@ use std::str::FromStr; pub struct Header { // Line contains the unmodified bytes of single header field. // It does not contain the final CRLF. - line: String, + line: HeaderLine, // Index is the position of the colon within the header field. // Invariant: index > 0 // Invariant: index + 1 < line.len() @@ -23,19 +83,53 @@ impl fmt::Debug for Header { impl Header { pub fn new(name: &str, value: &str) -> Self { - let line = format!("{}: {}", name, value); + let line = format!("{}: {}", name, value).into(); let index = name.len(); Header { line, index } } /// The header name. pub fn name(&self) -> &str { - &self.line.as_str()[0..self.index] + let bytes = &self.line.as_bytes()[0..self.index]; + // Since we validate the header name in HeaderLine::into_header, we + // are guaranteed it is valid utf-8 at this point. + from_utf8(bytes).expect("Legal chars in header name") } /// The header value. - pub fn value(&self) -> &str { - &self.line.as_str()[self.index + 1..].trim() + /// + /// For non-utf8 headers this returns None (use [`Header::value_raw()`]). + pub fn value(&self) -> Option<&str> { + let bytes = &self.line.as_bytes()[self.index + 1..]; + from_utf8(bytes) + .map(|s| s.trim()) + .ok() + // ensure all bytes are valid field value characters. + .filter(|s| s.as_bytes().iter().all(is_field_vchar_or_obs_fold)) + } + + /// The header value as a byte slice. + /// + /// For legacy reasons, the HTTP spec allows headers to contain non-ascii characters. 
+ /// Typically such headers are encoded in a non-utf8 encoding (such as iso-8859-1). + /// + /// ureq can't know what encoding the header is in, but this function provides + /// an escape hatch for users that need to handle such headers. + pub fn value_raw(&self) -> &[u8] { + let mut bytes = &self.line.as_bytes()[self.index + 1..]; + + if !bytes.is_empty() { + // trim front + while !bytes.is_empty() && bytes[0].is_ascii_whitespace() { + bytes = &bytes[1..]; + } + // trim back + while !bytes.is_empty() && bytes[bytes.len() - 1].is_ascii_whitespace() { + bytes = &bytes[..(bytes.len() - 1)]; + } + } + + bytes } /// Compares the given str to the header name ignoring case. @@ -44,7 +138,11 @@ impl Header { } pub(crate) fn validate(&self) -> Result<(), Error> { - if !valid_name(self.name()) || !valid_value(&self.line.as_str()[self.index + 1..]) { + let bytes = self.line.as_bytes(); + let name_raw = &bytes[0..self.index]; + let value_raw = &bytes[self.index + 1..]; + + if !valid_name(name_raw) || !valid_value(value_raw) { Err(ErrorKind::BadHeader.msg(&format!("invalid header '{}'", self.line))) } else { Ok(()) @@ -53,14 +151,17 @@ impl Header { } pub fn get_header<'a, 'b>(headers: &'b [Header], name: &'a str) -> Option<&'b str> { - headers.iter().find(|h| h.is_name(name)).map(|h| h.value()) + headers + .iter() + .find(|h| h.is_name(name)) + .and_then(|h| h.value()) } pub fn get_all_headers<'a, 'b>(headers: &'b [Header], name: &'a str) -> Vec<&'b str> { headers .iter() .filter(|h| h.is_name(name)) - .map(|h| h.value()) + .filter_map(|h| h.value()) .collect() } @@ -84,12 +185,12 @@ pub fn add_header(headers: &mut Vec
, header: Header) { // token = 1*tchar // tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / // "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA -fn valid_name(name: &str) -> bool { - !name.is_empty() && name.bytes().all(is_tchar) +fn valid_name(name: &[u8]) -> bool { + !name.is_empty() && name.iter().all(is_tchar) } #[inline] -fn is_tchar(b: u8) -> bool { +fn is_tchar(b: &u8) -> bool { match b { b'!' | b'#' | b'$' | b'%' | b'&' => true, b'\'' | b'*' | b'+' | b'-' | b'.' => true, @@ -112,12 +213,12 @@ fn is_tchar(b: u8) -> bool { // https://tools.ietf.org/html/rfc5234#appendix-B.1 // VCHAR = %x21-7E // ; visible (printing) characters -fn valid_value(value: &str) -> bool { - value.bytes().all(is_field_vchar_or_obs_fold) +fn valid_value(value: &[u8]) -> bool { + value.iter().all(is_field_vchar_or_obs_fold) } #[inline] -fn is_field_vchar_or_obs_fold(b: u8) -> bool { +fn is_field_vchar_or_obs_fold(b: &u8) -> bool { match b { b' ' | b'\t' => true, 0x21..=0x7E => true, @@ -129,17 +230,10 @@ impl FromStr for Header { type Err = Error; fn from_str(s: &str) -> Result { // - let line = s.to_string(); - let index = s - .find(':') - .ok_or_else(|| ErrorKind::BadHeader.msg("no colon in header"))?; + let line: HeaderLine = s.to_string().into(); - // no value? 
- if index >= s.len() { - return Err(ErrorKind::BadHeader.msg("no value in header")); - } + let header = line.into_header()?; - let header = Header { line, index }; header.validate()?; Ok(header) } @@ -151,27 +245,27 @@ mod tests { #[test] fn test_valid_name() { - assert!(valid_name("example")); - assert!(valid_name("Content-Type")); - assert!(valid_name("h-123456789")); - assert!(!valid_name("Content-Type:")); - assert!(!valid_name("Content-Type ")); - assert!(!valid_name(" some-header")); - assert!(!valid_name("\"invalid\"")); - assert!(!valid_name("Gödel")); + assert!(valid_name(b"example")); + assert!(valid_name(b"Content-Type")); + assert!(valid_name(b"h-123456789")); + assert!(!valid_name(b"Content-Type:")); + assert!(!valid_name(b"Content-Type ")); + assert!(!valid_name(b" some-header")); + assert!(!valid_name(b"\"invalid\"")); + assert!(!valid_name(b"G\xf6del")); } #[test] fn test_valid_value() { - assert!(valid_value("example")); - assert!(valid_value("foo bar")); - assert!(valid_value(" foobar ")); - assert!(valid_value(" foo\tbar ")); - assert!(valid_value(" foo~")); - assert!(valid_value(" !bar")); - assert!(valid_value(" ")); - assert!(!valid_value(" \nfoo")); - assert!(!valid_value("foo\x7F")); + assert!(valid_value(b"example")); + assert!(valid_value(b"foo bar")); + assert!(valid_value(b" foobar ")); + assert!(valid_value(b" foo\tbar ")); + assert!(valid_value(b" foo~")); + assert!(valid_value(b" !bar")); + assert!(valid_value(b" ")); + assert!(!valid_value(b" \nfoo")); + assert!(!valid_value(b"foo\x7F")); } #[test] @@ -197,25 +291,46 @@ mod tests { } } + #[test] + #[cfg(feature = "charset")] + fn test_parse_non_utf8_value() { + let (cow, _, _) = encoding_rs::WINDOWS_1252.encode("x-geo-stuff: älvsjö "); + let bytes = cow.to_vec(); + let line: HeaderLine = bytes.into(); + let header = line.into_header().unwrap(); + assert_eq!(header.name(), "x-geo-stuff"); + assert_eq!(header.value(), None); + assert_eq!(header.value_raw(), [228, 108, 118, 115, 106, 
246]); + } + #[test] fn empty_value() { let h = "foo:".parse::
().unwrap(); - assert_eq!(h.value(), ""); + assert_eq!(h.value(), Some("")); } #[test] fn value_with_whitespace() { let h = "foo: bar ".parse::
().unwrap(); - assert_eq!(h.value(), "bar"); + assert_eq!(h.value(), Some("bar")); } #[test] fn name_and_value() { let header: Header = "X-Forwarded-For: 127.0.0.1".parse().unwrap(); assert_eq!("X-Forwarded-For", header.name()); - assert_eq!("127.0.0.1", header.value()); + assert_eq!(header.value(), Some("127.0.0.1")); assert!(header.is_name("X-Forwarded-For")); assert!(header.is_name("x-forwarded-for")); assert!(header.is_name("X-FORWARDED-FOR")); } + + #[test] + fn test_iso8859_utf8_mixup() { + // C2 A5 is ¥ in UTF-8 and Â¥ in ISO-8859-1 + let b = "header: \0xc2\0xa5".to_string().into_bytes(); + let l: HeaderLine = b.into(); + let h = l.into_header().unwrap(); + assert_eq!(h.value(), None); + } } diff --git a/src/response.rs b/src/response.rs index e477aad..fc6ca2f 100644 --- a/src/response.rs +++ b/src/response.rs @@ -6,7 +6,7 @@ use chunked_transfer::Decoder as ChunkDecoder; use url::Url; use crate::error::{Error, ErrorKind::BadStatus}; -use crate::header::Header; +use crate::header::{get_all_headers, get_header, Header, HeaderLine}; use crate::pool::PoolReturnRead; use crate::stream::{DeadlineStream, Stream}; use crate::unit::Unit; @@ -119,20 +119,31 @@ impl Response { } /// The status text: `OK` + /// + /// The HTTP spec allows for non-utf8 status texts. This uses from_utf8_lossy to + /// convert such lines to &str. pub fn status_text(&self) -> &str { &self.status_line.as_str()[self.index.response_code + 1..].trim() } - /// The header corresponding header value for the give name, if any. + /// The header value for the given name, or None if not found. + /// + /// For historical reasons, the HTTP spec allows for header values + /// to be encoded using encodings like iso-8859-1. Such encodings + /// mean the values are not possible to interpret as utf-8. + /// + /// In case the header value can't be read as utf-8, this function + /// returns `None` (while the name is visible in [`Response::headers_names()`]). 
pub fn header(&self, name: &str) -> Option<&str> { - self.headers - .iter() - .find(|h| h.is_name(name)) - .map(|h| h.value()) + get_header(&self.headers, name) } /// A list of the header names in this response. /// Lowercased to be uniform. + /// + /// It's possible for a header name to be returned by this function, and + /// still give a `None` value. See [`Response::header()`] for an explanation + /// as to why. pub fn headers_names(&self) -> Vec { self.headers .iter() @@ -147,11 +158,7 @@ impl Response { /// All headers corresponding values for the give name, or empty vector. pub fn all(&self, name: &str) -> Vec<&str> { - self.headers - .iter() - .filter(|h| h.is_name(name)) - .map(|h| h.value()) - .collect() + get_all_headers(&self.headers, name) } /// The content type part of the "Content-Type" header without @@ -414,8 +421,9 @@ impl Response { // HTTP/1.1 200 OK\r\n let mut stream = stream::DeadlineStream::new(stream, unit.as_ref().and_then(|u| u.deadline)); - let status_line = read_next_line(&mut stream, "the status line")?; + // For the status line we can ignore non-utf8 chars and convert with into_string_lossy(). + let status_line = read_next_line(&mut stream, "the status line")?.into_string_lossy(); let (index, status) = parse_status_line(status_line.as_str())?; let mut headers: Vec
= Vec::new(); @@ -424,7 +432,7 @@ impl Response { if line.is_empty() { break; } - if let Ok(header) = line.as_str().parse::
() { + if let Ok(header) = line.into_header() { headers.push(header); } } @@ -539,22 +547,13 @@ impl FromStr for Response { } } -fn read_next_line(reader: &mut impl BufRead, context: &str) -> io::Result { - let mut s = String::new(); - let result = reader.read_line(&mut s); +fn read_next_line(reader: &mut impl BufRead, context: &str) -> io::Result { + let mut buf = Vec::new(); + let result = reader.read_until(b'\n', &mut buf); if let Err(e) = result { // Provide context to errors encountered while reading the line. - // ureq does not currently handle non-ascii status lines and - // header values. For historical reasons, the HTTP spec does - // allow for characters in the range 0x80-0xff, but these are - // very rarely encountered in the wild. - // See https://github.com/algesten/ureq/issues/320 - let reason = if e.kind() == io::ErrorKind::InvalidData { - format!("Invalid data in {}", context) - } else { - format!("Error encountered in {}", context) - }; + let reason = format!("Error encountered in {}", context); let kind = e.kind(); @@ -572,17 +571,19 @@ fn read_next_line(reader: &mut impl BufRead, context: &str) -> io::Result io::Result<()> // other headers for header in &unit.headers { if !redir || !header.is_name("Authorization") { - write!(prelude, "{}: {}\r\n", header.name(), header.value())?; + if let Some(v) = header.value() { + write!(prelude, "{}: {}\r\n", header.name(), v)?; + } } }