diff --git a/base/hashing2.jl b/base/hashing2.jl index becf7b9bed07d..a2653ca556a49 100644 --- a/base/hashing2.jl +++ b/base/hashing2.jl @@ -166,3 +166,15 @@ end ## hashing Float16s ## hash(x::Float16, h::UInt) = hash(Float64(x), h) + +## hashing strings ## + +const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed +const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81 + +function hash{T<:ByteString}(s::Union{T,SubString{T}}, h::UInt) + h += memhash_seed + # note: use pointer(s) here (see #6058). + ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), pointer(s), sizeof(s), h % UInt32) + h +end +hash(s::AbstractString, h::UInt) = hash(bytestring(s), h) diff --git a/base/parse.jl b/base/parse.jl new file mode 100644 index 0000000000000..c1f916d980441 --- /dev/null +++ b/base/parse.jl @@ -0,0 +1,199 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +## string to integer functions ## + +function parse{T<:Integer}(::Type{T}, c::Char, base::Integer=36) + a::Int = (base <= 36 ? 10 : 36) + 2 <= base <= 62 || throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base")) + d = '0' <= c <= '9' ? c-'0' : + 'A' <= c <= 'Z' ? c-'A'+10 : + 'a' <= c <= 'z' ? 
c-'a'+a : throw(ArgumentError("invalid digit: $(repr(c))")) + d < base || throw(ArgumentError("invalid base $base digit $(repr(c))")) + convert(T, d) +end + +function parseint_next(s::AbstractString, startpos::Int, endpos::Int) + (0 < startpos <= endpos) || (return Char(0), 0, 0) + j = startpos + c, startpos = next(s,startpos) + c, startpos, j +end + +function parseint_preamble(signed::Bool, base::Int, s::AbstractString, startpos::Int, endpos::Int) + c, i, j = parseint_next(s, startpos, endpos) + + while isspace(c) + c, i, j = parseint_next(s,i,endpos) + end + (j == 0) && (return 0, 0, 0) + + sgn = 1 + if signed + if c == '-' || c == '+' + (c == '-') && (sgn = -1) + c, i, j = parseint_next(s,i,endpos) + end + end + + while isspace(c) + c, i, j = parseint_next(s,i,endpos) + end + (j == 0) && (return 0, 0, 0) + + if base == 0 + if c == '0' && !done(s,i) + c, i = next(s,i) + base = c=='b' ? 2 : c=='o' ? 8 : c=='x' ? 16 : 10 + if base != 10 + c, i, j = parseint_next(s,i,endpos) + end + else + base = 10 + end + end + return sgn, base, j +end + +function tryparse_internal{S<:ByteString}(::Type{Bool}, sbuff::S, startpos::Int, endpos::Int, raise::Bool) + len = endpos-startpos+1 + p = pointer(sbuff)+startpos-1 + (len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "true", 4)) && (return Nullable(true)) + (len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "false", 5)) && (return Nullable(false)) + raise && throw(ArgumentError("invalid Bool representation: $(repr(SubString(s,startpos,endpos)))")) + Nullable{Bool}() +end + +safe_add{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? (n1 > (typemax(T) - n2)) : (n1 < (typemin(T) - n2))) ? Nullable{T}() : Nullable{T}(n1 + n2) +safe_mul{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? ((n1 > div(typemax(T),n2)) || (n1 < div(typemin(T),n2))) : + (n2 < -1) ? ((n1 > div(typemin(T),n2)) || (n1 < div(typemax(T),n2))) : + ((n2 == -1) && n1 == typemin(T))) ? 
Nullable{T}() : Nullable{T}(n1 * n2) + +function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, a::Int, raise::Bool) + _n = Nullable{T}() + sgn, base, i = parseint_preamble(T<:Signed, base, s, startpos, endpos) + if i == 0 + raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))")) + return _n + end + c, i = parseint_next(s,i,endpos) + if i == 0 + raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))")) + return _n + end + + base = convert(T,base) + m::T = div(typemax(T)-base+1,base) + n::T = 0 + while n <= m + d::T = '0' <= c <= '9' ? c-'0' : + 'A' <= c <= 'Z' ? c-'A'+10 : + 'a' <= c <= 'z' ? c-'a'+a : base + if d >= base + raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))")) + return _n + end + n *= base + n += d + if i > endpos + n *= sgn + return Nullable{T}(n) + end + c, i = next(s,i) + isspace(c) && break + end + (T <: Signed) && (n *= sgn) + while !isspace(c) + d::T = '0' <= c <= '9' ? c-'0' : + 'A' <= c <= 'Z' ? c-'A'+10 : + 'a' <= c <= 'z' ? 
c-'a'+a : base + if d >= base + raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))")) + return _n + end + (T <: Signed) && (d *= sgn) + + safe_n = safe_mul(n, base) + isnull(safe_n) || (safe_n = safe_add(get(safe_n), d)) + if isnull(safe_n) + raise && throw(OverflowError()) + return _n + end + n = get(safe_n) + (i > endpos) && return Nullable{T}(n) + c, i = next(s,i) + end + while i <= endpos + c, i = next(s,i) + if !isspace(c) + raise && throw(ArgumentError("extra characters after whitespace in $(repr(SubString(s,startpos,endpos)))")) + return _n + end + end + return Nullable{T}(n) +end +tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, raise::Bool) = + tryparse_internal(T,s,start(s),endof(s),base,raise) +tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool) = + tryparse_internal(T, s, startpos, endpos, base, base <= 36 ? 10 : 36, raise) +tryparse{T<:Integer}(::Type{T}, s::AbstractString, base::Int) = + 2 <= base <= 62 ? 
tryparse_internal(T,s,Int(base),false) : throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base")) +tryparse{T<:Integer}(::Type{T}, s::AbstractString) = tryparse_internal(T,s,0,false) + +function parse{T<:Integer}(::Type{T}, s::AbstractString, base::Integer) + (2 <= base <= 62) || throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base")) + get(tryparse_internal(T, s, base, true)) +end +parse{T<:Integer}(::Type{T}, s::AbstractString) = get(tryparse_internal(T, s, 0, true)) + +## stringifying integers more efficiently ## + +string(x::Union{Int8,Int16,Int32,Int64,Int128}) = dec(x) + +## string to float functions ## + +tryparse(::Type{Float64}, s::ByteString) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) +tryparse{T<:ByteString}(::Type{Float64}, s::SubString{T}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) + +tryparse(::Type{Float32}, s::ByteString) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) +tryparse{T<:ByteString}(::Type{Float32}, s::SubString{T}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) + +tryparse{T<:Union{Float32,Float64}}(::Type{T}, s::AbstractString) = tryparse(T, bytestring(s)) + +function parse{T<:FloatingPoint}(::Type{T}, s::AbstractString) + nf = tryparse(T, s) + isnull(nf) ? throw(ArgumentError("invalid number format $(repr(s)) for $T")) : get(nf) +end + +float(x::AbstractString) = parse(Float64,x) + +float{S<:AbstractString}(a::AbstractArray{S}) = map!(float, similar(a,typeof(float(0))), a) + +## interface to parser ## + +function parse(str::AbstractString, pos::Int; greedy::Bool=true, raise::Bool=true) + # returns (expr, end_pos). expr is () in case of parse error. 
+ bstr = bytestring(str) + ex, pos = ccall(:jl_parse_string, Any, + (Ptr{UInt8}, Csize_t, Int32, Int32), + bstr, sizeof(bstr), pos-1, greedy ? 1:0) + if raise && isa(ex,Expr) && is(ex.head,:error) + throw(ParseError(ex.args[1])) + end + if ex == () + raise && throw(ParseError("end of input")) + ex = Expr(:error, "end of input") + end + ex, pos+1 # C is zero-based, Julia is 1-based +end + +function parse(str::AbstractString; raise::Bool=true) + ex, pos = parse(str, start(str), greedy=true, raise=raise) + if isa(ex,Expr) && ex.head === :error + return ex + end + if !done(str, pos) + raise && throw(ParseError("extra token after end of expression")) + return Expr(:error, "extra token after end of expression") + end + return ex +end diff --git a/base/shell.jl b/base/shell.jl new file mode 100644 index 0000000000000..cfd7eb26ee690 --- /dev/null +++ b/base/shell.jl @@ -0,0 +1,167 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +## shell-like command parsing ## + +function shell_parse(raw::AbstractString, interp::Bool) + s = lstrip(raw) + #Strips the end but respects the space when the string endswith "\\ " + r = RevString(s) + i = start(r) + c_old = nothing + while !done(r,i) + c, j = next(r,i) + if c == '\\' && c_old == ' ' + i -= 1 + break + elseif !(c in _default_delims) + break + end + i = j + c_old = c + end + s = s[1:end-i+1] + + last_parse = 0:-1 + isempty(s) && return interp ? 
(Expr(:tuple,:()),last_parse) : ([],last_parse) + + in_single_quotes = false + in_double_quotes = false + + args::Vector{Any} = [] + arg::Vector{Any} = [] + i = start(s) + j = i + + function update_arg(x) + if !isa(x,AbstractString) || !isempty(x) + push!(arg, x) + end + end + function append_arg() + if isempty(arg); arg = Any["",]; end + push!(args, arg) + arg = [] + end + + while !done(s,j) + c, k = next(s,j) + if !in_single_quotes && !in_double_quotes && isspace(c) + update_arg(s[i:j-1]) + append_arg() + j = k + while !done(s,j) + c, k = next(s,j) + if !isspace(c) + i = j + break + end + j = k + end + elseif interp && !in_single_quotes && c == '$' + update_arg(s[i:j-1]); i = k; j = k + if done(s,k) + error("\$ right before end of command") + end + if isspace(s[k]) + error("space not allowed right after \$") + end + stpos = j + ex, j = parse(s,j,greedy=false) + last_parse = stpos:j + update_arg(esc(ex)); i = j + else + if !in_double_quotes && c == '\'' + in_single_quotes = !in_single_quotes + update_arg(s[i:j-1]); i = k + elseif !in_single_quotes && c == '"' + in_double_quotes = !in_double_quotes + update_arg(s[i:j-1]); i = k + elseif c == '\\' + if in_double_quotes + if done(s,k) + error("unterminated double quote") + end + if s[k] == '"' || s[k] == '$' + update_arg(s[i:j-1]); i = k + c, k = next(s,k) + end + elseif !in_single_quotes + if done(s,k) + error("dangling backslash") + end + update_arg(s[i:j-1]); i = k + c, k = next(s,k) + end + end + j = k + end + end + + if in_single_quotes; error("unterminated single quote"); end + if in_double_quotes; error("unterminated double quote"); end + + update_arg(s[i:end]) + append_arg() + + if !interp + return (args,last_parse) + end + + # construct an expression + ex = Expr(:tuple) + for arg in args + push!(ex.args, Expr(:tuple, arg...)) + end + (ex,last_parse) +end +shell_parse(s::AbstractString) = shell_parse(s,true) + +function shell_split(s::AbstractString) + parsed = shell_parse(s,false)[1] + args = 
AbstractString[] + for arg in parsed + push!(args, string(arg...)) + end + args +end + +function print_shell_word(io::IO, word::AbstractString) + if isempty(word) + print(io, "''") + end + has_single = false + has_special = false + for c in word + if isspace(c) || c=='\\' || c=='\'' || c=='"' || c=='$' + has_special = true + if c == '\'' + has_single = true + end + end + end + if !has_special + print(io, word) + elseif !has_single + print(io, '\'', word, '\'') + else + print(io, '"') + for c in word + if c == '"' || c == '$' + print(io, '\\') + end + print(io, c) + end + print(io, '"') + end +end + +function print_shell_escaped(io::IO, cmd::AbstractString, args::AbstractString...) + print_shell_word(io, cmd) + for arg in args + print(io, ' ') + print_shell_word(io, arg) + end +end +print_shell_escaped(io::IO) = nothing + +shell_escape(args::AbstractString...) = sprint(print_shell_escaped, args...) diff --git a/base/string.jl b/base/string.jl index f0d90f4f04162..a4976f67cfa14 100644 --- a/base/string.jl +++ b/base/string.jl @@ -1,1723 +1,7 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license -## core text I/O ## - -print(io::IO, x) = show(io, x) -print(io::IO, xs...) = for x in xs print(io, x) end - -println(io::IO, xs...) = print(io, xs..., '\n') - -print(xs...) = print(STDOUT, xs...) -println(xs...) = println(STDOUT, xs...) - -## core string functions ## - -endof(s::AbstractString) = error("you must implement endof(", typeof(s), ")") -next(s::AbstractString, i::Int) = error("you must implement next(", typeof(s), ",Int)") -next(s::DirectIndexString, i::Int) = (s[i],i+1) -next(s::AbstractString, i::Integer) = next(s,Int(i)) - -## conversion of general objects to strings ## - -function print_to_string(xs...) - # specialized for performance reasons - s = IOBuffer(Array(UInt8,isa(xs[1],AbstractString) ? 
endof(xs[1]) : 0), true, true) - for x in xs - print(s, x) - end - d = s.data - resize!(d,s.size) - bytestring(d) -end - -string() = "" -string(s::AbstractString) = s -string(xs...) = print_to_string(xs...) - -bytestring() = "" -bytestring(s::Vector{UInt8}) = bytestring(pointer(s),length(s)) -bytestring(s::AbstractString...) = print_to_string(s...) - -function bytestring(p::Union{Ptr{UInt8},Ptr{Int8}}) - p == C_NULL ? throw(ArgumentError("cannot convert NULL to string")) : - ccall(:jl_cstr_to_string, ByteString, (Ptr{UInt8},), p) -end -bytestring(s::Cstring) = bytestring(box(Ptr{Cchar}, unbox(Cstring,s))) - -function bytestring(p::Union{Ptr{UInt8},Ptr{Int8}},len::Integer) - p == C_NULL ? throw(ArgumentError("cannot convert NULL to string")) : - ccall(:jl_pchar_to_string, ByteString, (Ptr{UInt8},Int), p, len) -end - -convert(::Type{Vector{UInt8}}, s::AbstractString) = bytestring(s).data -convert(::Type{Array{UInt8}}, s::AbstractString) = bytestring(s).data -convert(::Type{ByteString}, s::AbstractString) = bytestring(s) -convert(::Type{Vector{Char}}, s::AbstractString) = collect(s) -convert(::Type{Symbol}, s::AbstractString) = symbol(s) - -## generic supplied functions ## - -start(s::AbstractString) = 1 -done(s::AbstractString,i) = (i > endof(s)) -getindex(s::AbstractString, i::Int) = next(s,i)[1] -getindex(s::AbstractString, i::Integer) = s[Int(i)] -getindex(s::AbstractString, x::Real) = s[to_index(x)] -getindex{T<:Integer}(s::AbstractString, r::UnitRange{T}) = s[Int(first(r)):Int(last(r))] -# TODO: handle other ranges with stride ±1 specially? 
-getindex(s::AbstractString, v::AbstractVector) = - sprint(length(v), io->(for i in v write(io,s[i]) end)) - -symbol(s::AbstractString) = symbol(bytestring(s)) - -print(io::IO, s::AbstractString) = (write(io, s); nothing) -write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); end; len) -show(io::IO, s::AbstractString) = print_quoted(io, s) - -sizeof(s::AbstractString) = error("type $(typeof(s)) has no canonical binary representation") - -eltype{T<:AbstractString}(::Type{T}) = Char - -(*)(s1::AbstractString, ss::AbstractString...) = string(s1, ss...) -(^)(s::AbstractString, r::Integer) = repeat(s,r) - -length(s::DirectIndexString) = endof(s) -function length(s::AbstractString) - i = start(s) - if done(s,i) - return 0 - end - n = 1 - while true - c, j = next(s,i) - if done(s,j) - return n - end - n += 1 - i = j - end -end - -isvalid(s::DirectIndexString, i::Integer) = (start(s) <= i <= endof(s)) -function isvalid(s::AbstractString, i::Integer) - i < 1 && return false - done(s,i) && return false - try - next(s,i) - true - catch - false - end -end - -prevind(s::DirectIndexString, i::Integer) = i-1 -prevind(s::AbstractArray , i::Integer) = i-1 -nextind(s::DirectIndexString, i::Integer) = i+1 -nextind(s::AbstractArray , i::Integer) = i+1 - -function prevind(s::AbstractString, i::Integer) - e = endof(s) - if i > e - return e - end - j = i-1 - while j >= 1 - if isvalid(s,j) - return j - end - j -= 1 - end - return 0 # out of range -end - -function nextind(s::AbstractString, i::Integer) - e = endof(s) - if i < 1 - return 1 - end - if i > e - return i+1 - end - for j = i+1:e - if isvalid(s,j) - return j - end - end - next(s,e)[2] # out of range -end - -checkbounds(s::AbstractString, i::Integer) = start(s) <= i <= endof(s) || throw(BoundsError(s, i)) -checkbounds(s::AbstractString, i::Real) = checkbounds(s, to_index(i)) -checkbounds{T<:Integer}(s::AbstractString, r::Range{T}) = isempty(r) || (minimum(r) >= start(s) && maximum(r) <= endof(s)) || 
throw(BoundsError(s, r)) -checkbounds{T<:Real}(s::AbstractString, I::AbstractArray{T}) = all(i -> checkbounds(s, i), I) - -ind2chr(s::DirectIndexString, i::Integer) = begin checkbounds(s,i); i end -chr2ind(s::DirectIndexString, i::Integer) = begin checkbounds(s,i); i end - -function ind2chr(s::AbstractString, i::Integer) - s[i] # throws error if invalid - j = 1 - k = start(s) - while true - c, l = next(s,k) - if i <= k - return j - end - j += 1 - k = l - end -end - -function chr2ind(s::AbstractString, i::Integer) - i < start(s) && throw(BoundsError(s, i)) - j = 1 - k = start(s) - while true - c, l = next(s,k) - if i == j - return k - end - j += 1 - k = l - end -end - -immutable EachStringIndex{T<:AbstractString} - s::T -end -eachindex(s::AbstractString) = EachStringIndex(s) - -length(e::EachStringIndex) = length(e.s) -start(e::EachStringIndex) = start(e.s) -next(e::EachStringIndex, state) = (state, nextind(e.s, state)) -done(e::EachStringIndex, state) = done(e.s, state) -eltype(e::EachStringIndex) = Int - -typealias Chars Union{Char,AbstractVector{Char},Set{Char}} - -function search(s::AbstractString, c::Chars, i::Integer) - if isempty(c) - return 1 <= i <= nextind(s,endof(s)) ? i : - throw(BoundsError(s, i)) - end - if i < 1 || i > nextind(s,endof(s)) - throw(BoundsError(s, i)) - end - while !done(s,i) - d, j = next(s,i) - if d in c - return i - end - i = j - end - return 0 -end -search(s::AbstractString, c::Chars) = search(s,c,start(s)) - -in(c::Char, s::AbstractString) = (search(s,c)!=0) - -function _searchindex(s, t, i) - if isempty(t) - return 1 <= i <= nextind(s,endof(s)) ? 
i : - throw(BoundsError(s, i)) - end - t1, j2 = next(t,start(t)) - while true - i = search(s,t1,i) - if i == 0 return 0 end - c, ii = next(s,i) - j = j2; k = ii - matched = true - while !done(t,j) - if done(s,k) - matched = false - break - end - c, k = next(s,k) - d, j = next(t,j) - if c != d - matched = false - break - end - end - if matched - return i - end - i = ii - end -end - -function _search_bloom_mask(c) - UInt64(1) << (c & 63) -end - -function _searchindex(s::Array, t::Array, i) - n = length(t) - m = length(s) - - if n == 0 - return 1 <= i <= m+1 ? max(1, i) : 0 - elseif m == 0 - return 0 - elseif n == 1 - return search(s, t[1], i) - end - - w = m - n - if w < 0 || i - 1 > w - return 0 - end - - bloom_mask = UInt64(0) - skip = n - 1 - tlast = t[end] - for j in 1:n - bloom_mask |= _search_bloom_mask(t[j]) - if t[j] == tlast && j < n - skip = n - j - 1 - end - end - - i -= 1 - while i <= w - if s[i+n] == tlast - # check candidate - j = 0 - while j < n - 1 - if s[i+j+1] != t[j+1] - break - end - j += 1 - end - - # match found - if j == n - 1 - return i+1 - end - - # no match, try to rule out the next character - if i < w && bloom_mask & _search_bloom_mask(s[i+n+1]) == 0 - i += n - else - i += skip - end - elseif i < w - if bloom_mask & _search_bloom_mask(s[i+n+1]) == 0 - i += n - end - end - i += 1 - end - - 0 -end - -typealias ByteArray Union{Vector{UInt8},Vector{Int8}} - -searchindex(s::ByteArray, t::ByteArray, i) = _searchindex(s,t,i) -searchindex(s::AbstractString, t::AbstractString, i::Integer) = _searchindex(s,t,i) -searchindex(s::AbstractString, t::AbstractString) = searchindex(s,t,start(s)) -searchindex(s::AbstractString, c::Char, i::Integer) = _searchindex(s,c,i) -searchindex(s::AbstractString, c::Char) = searchindex(s,c,start(s)) - -function searchindex(s::ByteString, t::ByteString, i::Integer=1) - # Check for fast case of a single byte - # (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead) - if endof(t) == 1 - search(s, t[1], 
i) - else - searchindex(s.data, t.data, i) - end -end - -function search(s::ByteArray, t::ByteArray, i) - idx = searchindex(s,t,i) - if isempty(t) - idx:idx-1 - else - idx:(idx > 0 ? idx + endof(t) - 1 : -1) - end -end - -function search(s::AbstractString, t::AbstractString, i::Integer=start(s)) - idx = searchindex(s,t,i) - if isempty(t) - idx:idx-1 - else - idx:(idx > 0 ? idx + endof(t) - 1 : -1) - end -end - -function rsearch(s::AbstractString, c::Chars) - j = search(RevString(s), c) - j == 0 && return 0 - endof(s)-j+1 -end - -function rsearch(s::AbstractString, c::Chars, i::Integer) - e = endof(s) - j = search(RevString(s), c, e-i+1) - j == 0 && return 0 - e-j+1 -end - -function _rsearchindex(s, t, i) - if isempty(t) - return 1 <= i <= nextind(s,endof(s)) ? i : - throw(BoundsError(s, i)) - end - t = RevString(t) - rs = RevString(s) - l = endof(s) - t1, j2 = next(t,start(t)) - while true - i = rsearch(s,t1,i) - if i == 0 return 0 end - c, ii = next(rs,l-i+1) - j = j2; k = ii - matched = true - while !done(t,j) - if done(rs,k) - matched = false - break - end - c, k = next(rs,k) - d, j = next(t,j) - if c != d - matched = false - break - end - end - if matched - return nextind(s,l-k+1) - end - i = l-ii+1 - end -end - -function _rsearchindex(s::Array, t::Array, k) - n = length(t) - m = length(s) - - if n == 0 - return 0 <= k <= m ? 
max(k, 1) : 0 - elseif m == 0 - return 0 - elseif n == 1 - return rsearch(s, t[1], k) - end - - w = m - n - if w < 0 || k <= 0 - return 0 - end - - bloom_mask = UInt64(0) - skip = n - 1 - tfirst = t[1] - for j in n:-1:1 - bloom_mask |= _search_bloom_mask(t[j]) - if t[j] == tfirst && j > 1 - skip = j - 2 - end - end - - i = min(k - n + 1, w + 1) - while i > 0 - if s[i] == tfirst - # check candidate - j = 1 - while j < n - if s[i+j] != t[j+1] - break - end - j += 1 - end - - # match found - if j == n - return i - end - - # no match, try to rule out the next character - if i > 1 && bloom_mask & _search_bloom_mask(s[i-1]) == 0 - i -= n - else - i -= skip - end - elseif i > 1 - if bloom_mask & _search_bloom_mask(s[i-1]) == 0 - i -= n - end - end - i -= 1 - end - - 0 -end - -rsearchindex(s::ByteArray,t::ByteArray,i) = _rsearchindex(s,t,i) -rsearchindex(s::AbstractString, t::AbstractString, i::Integer) = _rsearchindex(s,t,i) -rsearchindex(s::AbstractString, t::AbstractString) = (isempty(s) && isempty(t)) ? 1 : rsearchindex(s,t,endof(s)) - -function rsearchindex(s::ByteString, t::ByteString) - # Check for fast case of a single byte - # (for multi-byte UTF-8 sequences, use rsearchindex instead) - if endof(t) == 1 - rsearch(s, t[1]) - else - _rsearchindex(s.data, t.data, length(s.data)) - end -end - -function rsearchindex(s::ByteString, t::ByteString, i::Integer) - # Check for fast case of a single byte - # (for multi-byte UTF-8 sequences, use rsearchindex instead) - if endof(t) == 1 - rsearch(s, t[1], i) - elseif endof(t) != 0 - _rsearchindex(s.data, t.data, nextind(s, i)-1) - elseif i > sizeof(s) - return 0 - elseif i == 0 - return 1 - else - return i - end -end - -function rsearch(s::ByteArray, t::ByteArray, i::Integer) - idx = rsearchindex(s,t,i) - if isempty(t) - idx:idx-1 - else - idx:(idx > 0 ? 
idx + endof(t) - 1 : -1) - end -end - -function rsearch(s::AbstractString, t::AbstractString, i::Integer=endof(s)) - idx = rsearchindex(s,t,i) - if isempty(t) - idx:idx-1 - else - idx:(idx > 0 ? idx + endof(t) - 1 : -1) - end -end - -contains(haystack::AbstractString, needle::AbstractString) = searchindex(haystack,needle)!=0 - -in(::AbstractString, ::AbstractString) = error("use contains(x,y) for string containment") - -function cmp(a::AbstractString, b::AbstractString) - if a === b - return 0 - end - i = start(a) - j = start(b) - while !done(a,i) && !done(b,i) - c, i = next(a,i) - d, j = next(b,j) - if c != d - return c < d ? -1 : +1 - end - end - done(a,i) && !done(b,j) ? -1 : - !done(a,i) && done(b,j) ? +1 : 0 -end - -==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0 -isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0 - -# starts with and ends with predicates - -function startswith(a::AbstractString, b::AbstractString) - i = start(a) - j = start(b) - while !done(a,i) && !done(b,i) - c, i = next(a,i) - d, j = next(b,j) - if c != d return false end - end - done(b,i) -end -startswith(str::AbstractString, chars::Chars) = !isempty(str) && str[start(str)] in chars - -function endswith(a::AbstractString, b::AbstractString) - i = endof(a) - j = endof(b) - a1 = start(a) - b1 = start(b) - while a1 <= i && b1 <= j - c = a[i] - d = b[j] - if c != d return false end - i = prevind(a,i) - j = prevind(b,j) - end - j < b1 -end -endswith(str::AbstractString, chars::Chars) = !isempty(str) && str[end] in chars - -# faster comparisons for byte strings and symbols - -cmp(a::ByteString, b::ByteString) = lexcmp(a.data, b.data) -cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b))) - -==(a::ByteString, b::ByteString) = endof(a) == endof(b) && cmp(a,b) == 0 -isless(a::Symbol, b::Symbol) = cmp(a,b) < 0 - -startswith(a::ByteString, b::ByteString) = startswith(a.data, b.data) -startswith(a::Vector{UInt8}, b::Vector{UInt8}) = - (length(a) >= 
length(b) && ccall(:strncmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0) - -# TODO: fast endswith - -## character column width function ## - -strwidth(s::AbstractString) = (w=0; for c in s; w += charwidth(c); end; w) - -isascii(c::Char) = c < Char(0x80) -isascii(s::AbstractString) = all(isascii, s) -isascii(s::ASCIIString) = true - -## substrings reference original strings ## - -immutable SubString{T<:AbstractString} <: AbstractString - string::T - offset::Int - endof::Int - - function SubString(s::T, i::Int, j::Int) - if i > endof(s) || j i - j -= 1 - end - - o = i-1 - new(s, o, max(0, j-o)) - end - end -end -SubString{T<:AbstractString}(s::T, i::Int, j::Int) = SubString{T}(s, i, j) -SubString(s::SubString, i::Int, j::Int) = SubString(s.string, s.offset+i, s.offset+j) -SubString(s::AbstractString, i::Integer, j::Integer) = SubString(s, Int(i), Int(j)) -SubString(s::AbstractString, i::Integer) = SubString(s, i, endof(s)) - -write{T<:ByteString}(to::AbstractIOBuffer, s::SubString{T}) = - s.endof==0 ? 0 : write_sub(to, s.string.data, s.offset + 1, nextind(s, s.endof) - 1) - -sizeof(s::SubString{ASCIIString}) = s.endof -sizeof(s::SubString{UTF8String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1 - -# TODO: length(s::SubString) = ?? -# default implementation will work but it's slow -# can this be delegated efficiently somehow? -# that may require additional string interfaces -length{T<:DirectIndexString}(s::SubString{T}) = endof(s) - -function length(s::SubString{UTF8String}) - return s.endof==0 ? 
0 : Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t), - pointer(s), nextind(s, s.endof) - 1)) -end - -function next(s::SubString, i::Int) - if i < 1 || i > s.endof - throw(BoundsError(s, i)) - end - c, i = next(s.string, i+s.offset) - c, i-s.offset -end - -function getindex(s::SubString, i::Int) - if i < 1 || i > s.endof - throw(BoundsError(s, i)) - end - getindex(s.string, i+s.offset) -end - -endof(s::SubString) = s.endof - -function isvalid(s::SubString, i::Integer) - return (start(s) <= i <= endof(s)) && isvalid(s.string, s.offset+i) -end - -isvalid{T<:DirectIndexString}(s::SubString{T}, i::Integer) = (start(s) <= i <= endof(s)) - -ind2chr{T<:DirectIndexString}(s::SubString{T}, i::Integer) = begin checkbounds(s,i); i end -chr2ind{T<:DirectIndexString}(s::SubString{T}, i::Integer) = begin checkbounds(s,i); i end - -nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset -prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset - -convert{T<:AbstractString}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s)) - -bytestring{T <: ByteString}(p::SubString{T}) = bytestring(p.string.data[1+p.offset:p.offset+nextind(p, p.endof)-1]) - -function getindex(s::AbstractString, r::UnitRange{Int}) - if first(r) < 1 || endof(s) < last(r) - throw(BoundsError(s, r)) - end - SubString(s, first(r), last(r)) -end - -isascii(s::SubString{ASCIIString}) = true - -function cmp{T<:ByteString,S<:ByteString}(a::SubString{T}, b::SubString{S}) - na = sizeof(a) - nb = sizeof(b) - c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), - pointer(a), pointer(b), min(na,nb)) - c < 0 ? -1 : c > 0 ? +1 : cmp(na,nb) -end - -## hashing strings ## - -const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed -const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81 - -function hash{T<:ByteString}(s::Union{T,SubString{T}}, h::UInt) - h += memhash_seed - # note: use pointer(s) here (see #6058). 
- ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), pointer(s), sizeof(s), h % UInt32) + h -end -hash(s::AbstractString, h::UInt) = hash(bytestring(s), h) - -## efficient representation of repeated strings ## - -immutable RepString <: AbstractString - string::AbstractString - repeat::Integer -end - -function endof(s::RepString) - e = endof(s.string) - (next(s.string,e)[2]-1) * (s.repeat-1) + e -end -length(s::RepString) = length(s.string)*s.repeat -sizeof(s::RepString) = sizeof(s.string)*s.repeat - -function next(s::RepString, i::Int) - if i < 1 - throw(BoundsError(s, i)) - end - e = endof(s.string) - sz = next(s.string,e)[2]-1 - - r, j = divrem(i-1, sz) - j += 1 - - if r >= s.repeat || j > e - throw(BoundsError(s, i)) - end - - c, k = next(s.string, j) - c, k-j+i -end - -function repeat(s::AbstractString, r::Integer) - r < 0 ? throw(ArgumentError("can't repeat a string $r times")) : - r == 0 ? "" : - r == 1 ? s : - RepString(s,r) -end - -convert(::Type{RepString}, s::AbstractString) = RepString(s,1) - -function repeat(s::ByteString, r::Integer) - r < 0 && throw(ArgumentError("can't repeat a string $r times")) - d = s.data; n = length(d) - out = Array(UInt8, n*r) - for i=1:r - copy!(out, 1+(i-1)*n, d, 1, n) - end - convert(typeof(s), out) -end - -## reversed strings without data movement ## - -immutable RevString{T<:AbstractString} <: AbstractString - string::T -end - -endof(s::RevString) = endof(s.string) -length(s::RevString) = length(s.string) -sizeof(s::RevString) = sizeof(s.string) - -function next(s::RevString, i::Int) - n = endof(s); j = n-i+1 - (s.string[j], n-prevind(s.string,j)+1) -end - -reverse(s::AbstractString) = RevString(s) -reverse(s::RevString) = s.string - -isascii(s::RevString{ASCIIString}) = true - -## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)] - -reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i -reverseind(s::RevString, i::Integer) = endof(s) - i + 1 
-lastidx(s::AbstractString) = nextind(s, endof(s)) - 1 -lastidx(s::DirectIndexString) = length(s) -reverseind(s::SubString, i::Integer) = - reverseind(s.string, lastidx(s.string)-s.offset-s.endof+i) - s.offset - -## ropes for efficient concatenation, etc. ## - -immutable RopeString <: AbstractString - head::AbstractString - tail::AbstractString - depth::Int32 - endof::Int - - RopeString(h::RopeString, t::RopeString) = - strdepth(h.tail) + strdepth(t) < strdepth(h.head) ? - RopeString(h.head, RopeString(h.tail, t)) : - new(h, t, max(h.depth,t.depth)+1, endof(h)+endof(t)) - - RopeString(h::RopeString, t::AbstractString) = - strdepth(h.tail) < strdepth(h.head) ? - RopeString(h.head, RopeString(h.tail, t)) : - new(h, t, h.depth+1, endof(h)+endof(t)) - - RopeString(h::AbstractString, t::RopeString) = - strdepth(t.head) < strdepth(t.tail) ? - RopeString(RopeString(h, t.head), t.tail) : - new(h, t, t.depth+1, endof(h)+endof(t)) - - RopeString(h::AbstractString, t::AbstractString) = - new(h, t, 1, endof(h)+endof(t)) -end -RopeString(s::AbstractString) = RopeString(s,"") - -strdepth(s::AbstractString) = 0 -strdepth(s::RopeString) = s.depth - -function next(s::RopeString, i::Int) - eh = endof(s.head) - if i <= eh - return next(s.head, i) - else - c, j = next(s.tail, i-eh) - return c, j+eh - end -end - -endof(s::RopeString) = s.endof -length(s::RopeString) = length(s.head) + length(s.tail) -write(io::IO, s::RopeString) = (write(io, s.head); write(io, s.tail)) -sizeof(s::RopeString) = sizeof(s.head) + sizeof(s.tail) - -## uppercase and lowercase transformations ## -uppercase(s::AbstractString) = map(uppercase, s) -lowercase(s::AbstractString) = map(lowercase, s) - -function ucfirst(s::AbstractString) - isempty(s) || isupper(s[1]) ? s : string(uppercase(s[1]),s[nextind(s,1):end]) -end -function lcfirst(s::AbstractString) - isempty(s) || islower(s[1]) ? 
s : string(lowercase(s[1]),s[nextind(s,1):end]) -end - -## string map, filter, has ## - -map_result(s::AbstractString, a::Vector{UInt8}) = UTF8String(a) -map_result(s::Union{ASCIIString,SubString{ASCIIString}}, a::Vector{UInt8}) = bytestring(a) - -function map(f, s::AbstractString) - out = IOBuffer(Array(UInt8,endof(s)),true,true) - truncate(out,0) - for c in s - c2 = f(c) - if !isa(c2,Char) - throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) - end - write(out, c2::Char) - end - map_result(s, takebuf_array(out)) -end - -function filter(f, s::AbstractString) - out = IOBuffer(Array(UInt8,endof(s)),true,true) - truncate(out,0) - for c in s - if f(c) - write(out, c) - end - end - takebuf_string(out) -end - -## string promotion rules ## - -promote_rule{S<:AbstractString,T<:AbstractString}(::Type{S}, ::Type{T}) = UTF8String - -## printing literal quoted string data ## - -# this is the inverse of print_unescaped_chars(io, s, "\\\") - -function print_quoted_literal(io, s::AbstractString) - print(io, '"') - for c = s; c == '"' ? print(io, "\\\"") : print(io, c); end - print(io, '"') -end - -## string escaping & unescaping ## - -escape_nul(s::AbstractString, i::Int) = - !done(s,i) && '0' <= next(s,i)[1] <= '7' ? "\\x00" : "\\0" - -isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' -isxdigit(s::AbstractString) = all(isxdigit, s) -need_full_hex(s::AbstractString, i::Int) = !done(s,i) && isxdigit(next(s,i)[1]) - -function print_escaped(io, s::AbstractString, esc::AbstractString) - i = start(s) - while !done(s,i) - c, j = next(s,i) - c == '\0' ? print(io, escape_nul(s,j)) : - c == '\e' ? print(io, "\\e") : - c == '\\' ? print(io, "\\\\") : - c in esc ? print(io, '\\', c) : - '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : - isprint(c) ? print(io, c) : - c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : - c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 
4 : 2)) : - print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4)) - i = j - end -end - -escape_string(s::AbstractString) = sprint(endof(s), print_escaped, s, "\"") -function print_quoted(io, s::AbstractString) - print(io, '"') - print_escaped(io, s, "\"\$") #"# work around syntax highlighting problem - print(io, '"') -end - -# bare minimum unescaping function unescapes only given characters - -function print_unescaped_chars(io, s::AbstractString, esc::AbstractString) - if !('\\' in esc) - esc = string("\\", esc) - end - i = start(s) - while !done(s,i) - c, i = next(s,i) - if c == '\\' && !done(s,i) && s[i] in esc - c, i = next(s,i) - end - print(io, c) - end -end - -unescape_chars(s::AbstractString, esc::AbstractString) = - sprint(endof(s), print_unescaped_chars, s, esc) - -# general unescaping of traditional C and Unicode escape sequences - -function print_unescaped(io, s::AbstractString) - i = start(s) - while !done(s,i) - c, i = next(s,i) - if !done(s,i) && c == '\\' - c, i = next(s,i) - if c == 'x' || c == 'u' || c == 'U' - n = k = 0 - m = c == 'x' ? 2 : - c == 'u' ? 4 : 8 - while (k+=1) <= m && !done(s,i) - c, j = next(s,i) - n = '0' <= c <= '9' ? n<<4 + c-'0' : - 'a' <= c <= 'f' ? n<<4 + c-'a'+10 : - 'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break - i = j - end - if k == 1 - throw(ArgumentError("\\x used with no following hex digits in $(repr(s))")) - end - if m == 2 # \x escape sequence - write(io, UInt8(n)) - else - print(io, Char(n)) - end - elseif '0' <= c <= '7' - k = 1 - n = c-'0' - while (k+=1) <= 3 && !done(s,i) - c, j = next(s,i) - n = ('0' <= c <= '7') ? n<<3 + c-'0' : break - i = j - end - if n > 255 - throw(ArgumentError("octal escape sequence out of range")) - end - write(io, UInt8(n)) - else - print(io, c == 'a' ? '\a' : - c == 'b' ? '\b' : - c == 't' ? '\t' : - c == 'n' ? '\n' : - c == 'v' ? '\v' : - c == 'f' ? '\f' : - c == 'r' ? '\r' : - c == 'e' ? 
'\e' : c) - end - else - print(io, c) - end - end -end - -unescape_string(s::AbstractString) = sprint(endof(s), print_unescaped, s) - -## checking UTF-8 & ACSII validity ## - -byte_string_classify(data::Vector{UInt8}) = - ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data)) -byte_string_classify(s::ByteString) = byte_string_classify(s.data) - # 0: neither valid ASCII nor UTF-8 - # 1: valid ASCII - # 2: valid UTF-8 - -isvalid(::Type{ASCIIString}, s::Union{Vector{UInt8},ByteString}) = byte_string_classify(s) == 1 -isvalid(::Type{UTF8String}, s::Union{Vector{UInt8},ByteString}) = byte_string_classify(s) != 0 - -## multiline strings ## - -function blank_width(c::Char) - c == ' ' ? 1 : - c == '\t' ? 8 : - throw(ArgumentError("$(repr(c)) not a blank character")) -end - -# width of leading blank space, also check if string is blank -function indentation(s::AbstractString) - count = 0 - for c in s - if c == ' ' || c == '\t' - count += blank_width(c) - else - return count, false - end - end - count, true -end - -function unindent(s::AbstractString, indent::Int) - indent == 0 && return s - buf = IOBuffer(Array(UInt8,endof(s)), true, true) - truncate(buf,0) - a = i = start(s) - cutting = false - cut = 0 - while !done(s,i) - c,i_ = next(s,i) - if cutting && (c == ' ' || c == '\t') - a = i_ - cut += blank_width(c) - if cut == indent - cutting = false - elseif cut > indent - cutting = false - for _ = (indent+1):cut write(buf, ' ') end - end - elseif c == '\n' - print(buf, s[a:i]) - a = i_ - cutting = true - cut = 0 - else - cutting = false - end - i = i_ - end - print(buf, s[a:end]) - takebuf_string(buf) -end - -## core string macros ## - -macro b_str(s); :($(unescape_string(s)).data); end - -## shell-like command parsing ## - -function shell_parse(raw::AbstractString, interp::Bool) - s = lstrip(raw) - #Strips the end but respects the space when the string endswith "\\ " - r = RevString(s) - i = start(r) - c_old = nothing - while !done(r,i) - c, j = next(r,i) - if c 
== '\\' && c_old == ' ' - i -= 1 - break - elseif !(c in _default_delims) - break - end - i = j - c_old = c - end - s = s[1:end-i+1] - - last_parse = 0:-1 - isempty(s) && return interp ? (Expr(:tuple,:()),last_parse) : ([],last_parse) - - in_single_quotes = false - in_double_quotes = false - - args::Vector{Any} = [] - arg::Vector{Any} = [] - i = start(s) - j = i - - function update_arg(x) - if !isa(x,AbstractString) || !isempty(x) - push!(arg, x) - end - end - function append_arg() - if isempty(arg); arg = Any["",]; end - push!(args, arg) - arg = [] - end - - while !done(s,j) - c, k = next(s,j) - if !in_single_quotes && !in_double_quotes && isspace(c) - update_arg(s[i:j-1]) - append_arg() - j = k - while !done(s,j) - c, k = next(s,j) - if !isspace(c) - i = j - break - end - j = k - end - elseif interp && !in_single_quotes && c == '$' - update_arg(s[i:j-1]); i = k; j = k - if done(s,k) - error("\$ right before end of command") - end - if isspace(s[k]) - error("space not allowed right after \$") - end - stpos = j - ex, j = parse(s,j,greedy=false) - last_parse = stpos:j - update_arg(esc(ex)); i = j - else - if !in_double_quotes && c == '\'' - in_single_quotes = !in_single_quotes - update_arg(s[i:j-1]); i = k - elseif !in_single_quotes && c == '"' - in_double_quotes = !in_double_quotes - update_arg(s[i:j-1]); i = k - elseif c == '\\' - if in_double_quotes - if done(s,k) - error("unterminated double quote") - end - if s[k] == '"' || s[k] == '$' - update_arg(s[i:j-1]); i = k - c, k = next(s,k) - end - elseif !in_single_quotes - if done(s,k) - error("dangling backslash") - end - update_arg(s[i:j-1]); i = k - c, k = next(s,k) - end - end - j = k - end - end - - if in_single_quotes; error("unterminated single quote"); end - if in_double_quotes; error("unterminated double quote"); end - - update_arg(s[i:end]) - append_arg() - - if !interp - return (args,last_parse) - end - - # construct an expression - ex = Expr(:tuple) - for arg in args - push!(ex.args, Expr(:tuple, 
arg...)) - end - (ex,last_parse) -end -shell_parse(s::AbstractString) = shell_parse(s,true) - -function shell_split(s::AbstractString) - parsed = shell_parse(s,false)[1] - args = AbstractString[] - for arg in parsed - push!(args, string(arg...)) - end - args -end - -function print_shell_word(io::IO, word::AbstractString) - if isempty(word) - print(io, "''") - end - has_single = false - has_special = false - for c in word - if isspace(c) || c=='\\' || c=='\'' || c=='"' || c=='$' - has_special = true - if c == '\'' - has_single = true - end - end - end - if !has_special - print(io, word) - elseif !has_single - print(io, '\'', word, '\'') - else - print(io, '"') - for c in word - if c == '"' || c == '$' - print(io, '\\') - end - print(io, c) - end - print(io, '"') - end -end - -function print_shell_escaped(io::IO, cmd::AbstractString, args::AbstractString...) - print_shell_word(io, cmd) - for arg in args - print(io, ' ') - print_shell_word(io, arg) - end -end -print_shell_escaped(io::IO) = nothing - -shell_escape(args::AbstractString...) = sprint(print_shell_escaped, args...) - -## interface to parser ## - -function parse(str::AbstractString, pos::Int; greedy::Bool=true, raise::Bool=true) - # returns (expr, end_pos). expr is () in case of parse error. - bstr = bytestring(str) - ex, pos = ccall(:jl_parse_string, Any, - (Ptr{UInt8}, Csize_t, Int32, Int32), - bstr, sizeof(bstr), pos-1, greedy ? 
1:0) - if raise && isa(ex,Expr) && is(ex.head,:error) - throw(ParseError(ex.args[1])) - end - if ex == () - raise && throw(ParseError("end of input")) - ex = Expr(:error, "end of input") - end - ex, pos+1 # C is zero-based, Julia is 1-based -end - -function parse(str::AbstractString; raise::Bool=true) - ex, pos = parse(str, start(str), greedy=true, raise=raise) - if isa(ex,Expr) && ex.head === :error - return ex - end - if !done(str, pos) - raise && throw(ParseError("extra token after end of expression")) - return Expr(:error, "extra token after end of expression") - end - return ex -end - -## miscellaneous string functions ## - -function lpad(s::AbstractString, n::Integer, p::AbstractString=" ") - m = n - strwidth(s) - if m <= 0; return s; end - l = strwidth(p) - if l==1 - return bytestring(p^m * s) - end - q = div(m,l) - r = m - q*l - i = r != 0 ? chr2ind(p, r) : -1 - bytestring(p^q*p[1:i]*s) -end - -function rpad(s::AbstractString, n::Integer, p::AbstractString=" ") - m = n - strwidth(s) - if m <= 0; return s; end - l = strwidth(p) - if l==1 - return bytestring(s * p^m) - end - q = div(m,l) - r = m - q*l - i = r != 0 ? chr2ind(p, r) : -1 - bytestring(s*p^q*p[1:i]) -end - -lpad(s, n::Integer, p=" ") = lpad(string(s),n,string(p)) -rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p)) -cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p) - - -# splitter can be a Char, Vector{Char}, AbstractString, Regex, ... 
-# any splitter that provides search(s::AbstractString, splitter) -split{T<:SubString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _split(str, splitter, limit, keep, T[]) -split{T<:AbstractString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _split(str, splitter, limit, keep, SubString{T}[]) -function _split{T<:AbstractString,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U) - i = start(str) - n = endof(str) - r = search(str,splitter,i) - j, k = first(r), nextind(str,last(r)) - while 0 < j <= n && length(strs) != limit-1 - if i < k - if keep_empty || i < j - push!(strs, SubString(str,i,prevind(str,j))) - end - i = k - end - if k <= j; k = nextind(str,j) end - r = search(str,splitter,k) - j, k = first(r), nextind(str,last(r)) - end - if keep_empty || !done(str,i) - push!(strs, SubString(str,i)) - end - return strs -end - -# a bit oddball, but standard behavior in Perl, Ruby & Python: -const _default_delims = [' ','\t','\n','\v','\f','\r'] -split(str::AbstractString) = split(str, _default_delims; limit=0, keep=false) - -rsplit{T<:SubString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _rsplit(str, splitter, limit, keep, T[]) -rsplit{T<:AbstractString}(str::T, splitter ; limit::Integer=0, keep::Bool=true) = _rsplit(str, splitter, limit, keep, SubString{T}[]) -function _rsplit{T<:AbstractString,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U) - i = start(str) - n = endof(str) - r = rsearch(str,splitter) - j = first(r)-1 - k = last(r) - while((0 <= j < n) && (length(strs) != limit-1)) - if i <= k - (keep_empty || (k < n)) && unshift!(strs, SubString(str,k+1,n)) - n = j - end - (k <= j) && (j = prevind(str,j)) - r = rsearch(str,splitter,j) - j = first(r)-1 - k = last(r) - end - (keep_empty || (n > 0)) && unshift!(strs, SubString(str,1,n)) - return strs -end -#rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false) - -_replacement(repl, str, j, k) = repl -_replacement(repl::Function, 
str, j, k) = repl(SubString(str, j, k)) - -function replace(str::ByteString, pattern, repl, limit::Integer) - n = 1 - e = endof(str) - i = a = start(str) - r = search(str,pattern,i) - j, k = first(r), last(r) - out = IOBuffer() - while j != 0 - if i == a || i <= k - write_sub(out, str.data, i, j-i) - write(out, _replacement(repl, str, j, k)) - end - if k e - break - end - r = search(str,pattern,k) - j, k = first(r), last(r) - n == limit && break - n += 1 - end - write(out, SubString(str,i)) - takebuf_string(out) -end -replace(s::AbstractString, pat, f, n::Integer) = replace(bytestring(s), pat, f, n) -replace(s::AbstractString, pat, r) = replace(s, pat, r, 0) - -function print_joined(io, strings, delim, last) - i = start(strings) - if done(strings,i) - return - end - str, i = next(strings,i) - print(io, str) - is_done = done(strings,i) - while !is_done - str, i = next(strings,i) - is_done = done(strings,i) - print(io, is_done ? last : delim) - print(io, str) - end -end - -function print_joined(io, strings, delim) - i = start(strings) - is_done = done(strings,i) - while !is_done - str, i = next(strings,i) - is_done = done(strings,i) - print(io, str) - if !is_done - print(io, delim) - end - end -end -print_joined(io, strings) = print_joined(io, strings, "") - -join(args...) = sprint(print_joined, args...) - -chop(s::AbstractString) = s[1:end-1] - -function chomp(s::AbstractString) - i = endof(s) - if (i < 1 || s[i] != '\n') return s end - j = prevind(s,i) - if (j < 1 || s[j] != '\r') return s[1:i-1] end - return s[1:j-1] -end -chomp(s::ByteString) = - (endof(s) < 1 || s.data[end] != 0x0a) ? s : - (endof(s) < 2 || s.data[end-1] != 0x0d) ? s[1:end-1] : s[1:end-2] - -# NOTE: use with caution -- breaks the immutable string convention! -function chomp!(s::ByteString) - if !isempty(s) && s.data[end] == 0x0a - n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 
1 : 2 - ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) - end - return s -end -chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types - -function lstrip(s::AbstractString, chars::Chars=_default_delims) - i = start(s) - while !done(s,i) - c, j = next(s,i) - if !(c in chars) - return s[i:end] - end - i = j - end - "" -end - -function rstrip(s::AbstractString, chars::Chars=_default_delims) - r = RevString(s) - i = start(r) - while !done(r,i) - c, j = next(r,i) - if !(c in chars) - return s[1:end-i+1] - end - i = j - end - "" -end - -strip(s::AbstractString) = lstrip(rstrip(s)) -strip(s::AbstractString, chars::Chars) = lstrip(rstrip(s, chars), chars) - -## string to integer functions ## - -function parse{T<:Integer}(::Type{T}, c::Char, base::Integer=36) - a::Int = (base <= 36 ? 10 : 36) - 2 <= base <= 62 || throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base")) - d = '0' <= c <= '9' ? c-'0' : - 'A' <= c <= 'Z' ? c-'A'+10 : - 'a' <= c <= 'z' ? c-'a'+a : throw(ArgumentError("invalid digit: $(repr(c))")) - d < base || throw(ArgumentError("invalid base $base digit $(repr(c))")) - convert(T, d) -end - -function parseint_next(s::AbstractString, startpos::Int, endpos::Int) - (0 < startpos <= endpos) || (return Char(0), 0, 0) - j = startpos - c, startpos = next(s,startpos) - c, startpos, j -end - -function parseint_preamble(signed::Bool, base::Int, s::AbstractString, startpos::Int, endpos::Int) - c, i, j = parseint_next(s, startpos, endpos) - - while isspace(c) - c, i, j = parseint_next(s,i,endpos) - end - (j == 0) && (return 0, 0, 0) - - sgn = 1 - if signed - if c == '-' || c == '+' - (c == '-') && (sgn = -1) - c, i, j = parseint_next(s,i,endpos) - end - end - - while isspace(c) - c, i, j = parseint_next(s,i,endpos) - end - (j == 0) && (return 0, 0, 0) - - if base == 0 - if c == '0' && !done(s,i) - c, i = next(s,i) - base = c=='b' ? 2 : c=='o' ? 8 : c=='x' ? 
16 : 10 - if base != 10 - c, i, j = parseint_next(s,i,endpos) - end - else - base = 10 - end - end - return sgn, base, j -end - -function tryparse_internal{S<:ByteString}(::Type{Bool}, sbuff::S, startpos::Int, endpos::Int, raise::Bool) - len = endpos-startpos+1 - p = pointer(sbuff)+startpos-1 - (len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "true", 4)) && (return Nullable(true)) - (len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "false", 5)) && (return Nullable(false)) - raise && throw(ArgumentError("invalid Bool representation: $(repr(SubString(s,startpos,endpos)))")) - Nullable{Bool}() -end - -safe_add{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? (n1 > (typemax(T) - n2)) : (n1 < (typemin(T) - n2))) ? Nullable{T}() : Nullable{T}(n1 + n2) -safe_mul{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? ((n1 > div(typemax(T),n2)) || (n1 < div(typemin(T),n2))) : - (n2 < -1) ? ((n1 > div(typemin(T),n2)) || (n1 < div(typemax(T),n2))) : - ((n2 == -1) && n1 == typemin(T))) ? Nullable{T}() : Nullable{T}(n1 * n2) - -function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, a::Int, raise::Bool) - _n = Nullable{T}() - sgn, base, i = parseint_preamble(T<:Signed, base, s, startpos, endpos) - if i == 0 - raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))")) - return _n - end - c, i = parseint_next(s,i,endpos) - if i == 0 - raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))")) - return _n - end - - base = convert(T,base) - m::T = div(typemax(T)-base+1,base) - n::T = 0 - while n <= m - d::T = '0' <= c <= '9' ? c-'0' : - 'A' <= c <= 'Z' ? c-'A'+10 : - 'a' <= c <= 'z' ? 
c-'a'+a : base - if d >= base - raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))")) - return _n - end - n *= base - n += d - if i > endpos - n *= sgn - return Nullable{T}(n) - end - c, i = next(s,i) - isspace(c) && break - end - (T <: Signed) && (n *= sgn) - while !isspace(c) - d::T = '0' <= c <= '9' ? c-'0' : - 'A' <= c <= 'Z' ? c-'A'+10 : - 'a' <= c <= 'z' ? c-'a'+a : base - if d >= base - raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))")) - return _n - end - (T <: Signed) && (d *= sgn) - - safe_n = safe_mul(n, base) - isnull(safe_n) || (safe_n = safe_add(get(safe_n), d)) - if isnull(safe_n) - raise && throw(OverflowError()) - return _n - end - n = get(safe_n) - (i > endpos) && return Nullable{T}(n) - c, i = next(s,i) - end - while i <= endpos - c, i = next(s,i) - if !isspace(c) - raise && throw(ArgumentError("extra characters after whitespace in $(repr(SubString(s,startpos,endpos)))")) - return _n - end - end - return Nullable{T}(n) -end -tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, raise::Bool) = - tryparse_internal(T,s,start(s),endof(s),base,raise) -tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool) = - tryparse_internal(T, s, startpos, endpos, base, base <= 36 ? 10 : 36, raise) -tryparse{T<:Integer}(::Type{T}, s::AbstractString, base::Int) = - 2 <= base <= 62 ? 
tryparse_internal(T,s,Int(base),false) : throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base")) -tryparse{T<:Integer}(::Type{T}, s::AbstractString) = tryparse_internal(T,s,0,false) - -function parse{T<:Integer}(::Type{T}, s::AbstractString, base::Integer) - (2 <= base <= 62) || throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base")) - get(tryparse_internal(T, s, base, true)) -end -parse{T<:Integer}(::Type{T}, s::AbstractString) = get(tryparse_internal(T, s, 0, true)) - -## stringifying integers more efficiently ## - -string(x::Union{Int8,Int16,Int32,Int64,Int128}) = dec(x) - -## string to float functions ## - -tryparse(::Type{Float64}, s::ByteString) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) -tryparse{T<:ByteString}(::Type{Float64}, s::SubString{T}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) - -tryparse(::Type{Float32}, s::ByteString) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) -tryparse{T<:ByteString}(::Type{Float32}, s::SubString{T}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) - -tryparse{T<:Union{Float32,Float64}}(::Type{T}, s::AbstractString) = tryparse(T, bytestring(s)) - -function parse{T<:FloatingPoint}(::Type{T}, s::AbstractString) - nf = tryparse(T, s) - isnull(nf) ? throw(ArgumentError("invalid number format $(repr(s)) for $T")) : get(nf) -end - -float(x::AbstractString) = parse(Float64,x) - -float{S<:AbstractString}(a::AbstractArray{S}) = map!(float, similar(a,typeof(float(0))), a) - -# find the index of the first occurrence of a value in a byte array - -function search(a::ByteArray, b::Union{Int8,UInt8}, i::Integer) - if i < 1 - throw(BoundsError(a, i)) - end - n = length(a) - if i > n - return i == n+1 ? 
0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) - q == C_NULL ? 0 : Int(q-p+1) -end -function search(a::ByteArray, b::Char, i::Integer) - if isascii(b) - search(a,UInt8(b),i) - else - search(a,string(b).data,i).start - end -end -search(a::ByteArray, b::Union{Int8,UInt8,Char}) = search(a,b,1) - -function rsearch(a::ByteArray, b::Union{Int8,UInt8}, i::Integer) - if i < 1 - return i == 0 ? 0 : throw(BoundsError(a, i)) - end - n = length(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) - q == C_NULL ? 0 : Int(q-p+1) -end -function rsearch(a::ByteArray, b::Char, i::Integer) - if isascii(b) - rsearch(a,UInt8(b),i) - else - rsearch(a,string(b).data,i).start - end -end -rsearch(a::ByteArray, b::Union{Int8,UInt8,Char}) = rsearch(a,b,length(a)) - -function hex2bytes(s::ASCIIString) - len = length(s) - iseven(len) || throw(ArgumentError("string length must be even: length($(repr(s))) == $len")) - arr = zeros(UInt8, div(len,2)) - i = j = 0 - while i < len - n = 0 - c = s[i+=1] - n = '0' <= c <= '9' ? c - '0' : - 'a' <= c <= 'f' ? c - 'a' + 10 : - 'A' <= c <= 'F' ? c - 'A' + 10 : - throw(ArgumentError("not a hexadecimal string: $(repr(s))")) - c = s[i+=1] - n = '0' <= c <= '9' ? n << 4 + c - '0' : - 'a' <= c <= 'f' ? n << 4 + c - 'a' + 10 : - 'A' <= c <= 'F' ? 
n << 4 + c - 'A' + 10 : - throw(ArgumentError("not a hexadecimal string: $(repr(s))")) - arr[j+=1] = n - end - return arr -end - -bytes2hex{T<:UInt8}(arr::Vector{T}) = join([hex(i,2) for i in arr]) - -function repr(x) - s = IOBuffer() - showall(s, x) - takebuf_string(s) -end - -containsnul(s::AbstractString) = '\0' in s -containsnul(s::ByteString) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s)) -containsnul(s::Union{UTF16String,UTF32String}) = findfirst(s.data, 0) != length(s.data) - -if sizeof(Cwchar_t) == 2 - const WString = UTF16String - const wstring = utf16 -elseif sizeof(Cwchar_t) == 4 - const WString = UTF32String - const wstring = utf32 -end -wstring(s::Cwstring) = wstring(box(Ptr{Cwchar_t}, unbox(Cwstring,s))) - -# Cwstring is defined in c.jl, but conversion needs to be defined here -# to have WString -function unsafe_convert(::Type{Cwstring}, s::WString) - if containsnul(s) - throw(ArgumentError("embedded NUL chars are not allowed in C strings: $(repr(s))")) - end - return Cwstring(unsafe_convert(Ptr{Cwchar_t}, s)) -end - -# pointer conversions of ASCII/UTF8/UTF16/UTF32 strings: -pointer(x::Union{ByteString,UTF16String,UTF32String}) = pointer(x.data) -pointer{T<:ByteString}(x::SubString{T}) = pointer(x.string.data) + x.offset -pointer(x::ByteString, i::Integer) = pointer(x.data)+(i-1) -pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x.offset + (i-1) -pointer(x::Union{UTF16String,UTF32String}, i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data)) -pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data)) -pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data)) - -# IOBuffer views of a (byte)string: -IOBuffer(str::ByteString) = IOBuffer(str.data) -IOBuffer{T<:ByteString}(s::SubString{T}) = IOBuffer(sub(s.string.data, s.offset + 1 : s.offset + sizeof(s))) 
+include("strings/types.jl")
+include("strings/basic.jl")
+include("strings/search.jl")
+include("strings/util.jl")
+include("strings/io.jl")
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
new file mode 100644
index 0000000000000..3a1ae110976ae
--- /dev/null
+++ b/base/strings/basic.jl
@@ -0,0 +1,270 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+## core string functions ##
+
+endof(s::AbstractString) = error("you must implement endof(", typeof(s), ")")  # fallback: errors until a string type defines endof
+next(s::AbstractString, i::Int) = error("you must implement next(", typeof(s), ",Int)")  # fallback: errors until a string type defines next
+next(s::DirectIndexString, i::Int) = (s[i],i+1)  # direct-index strings: the next state is simply i+1
+next(s::AbstractString, i::Integer) = next(s,Int(i))  # normalize any Integer state to Int
+
+string() = ""
+string(s::AbstractString) = s  # a single string is returned as-is
+
+bytestring() = ""
+bytestring(s::Vector{UInt8}) = bytestring(pointer(s),length(s))
+
+function bytestring(p::Union{Ptr{UInt8},Ptr{Int8}})  # throws ArgumentError for a NULL pointer
+    p == C_NULL ? throw(ArgumentError("cannot convert NULL to string")) :
+    ccall(:jl_cstr_to_string, ByteString, (Ptr{UInt8},), p)
+end
+bytestring(s::Cstring) = bytestring(box(Ptr{Cchar}, unbox(Cstring,s)))
+
+function bytestring(p::Union{Ptr{UInt8},Ptr{Int8}},len::Integer)  # same as above, with an explicit length passed to the C call
+    p == C_NULL ? throw(ArgumentError("cannot convert NULL to string")) :
+    ccall(:jl_pchar_to_string, ByteString, (Ptr{UInt8},Int), p, len)
+end
+
+convert(::Type{Vector{UInt8}}, s::AbstractString) = bytestring(s).data
+convert(::Type{Array{UInt8}}, s::AbstractString) = bytestring(s).data
+convert(::Type{ByteString}, s::AbstractString) = bytestring(s)
+convert(::Type{Vector{Char}}, s::AbstractString) = collect(s)
+convert(::Type{Symbol}, s::AbstractString) = symbol(s)
+
+## generic supplied functions ##
+
+start(s::AbstractString) = 1  # iteration always begins at index 1
+done(s::AbstractString,i) = (i > endof(s))  # finished once the state passes the last index
+getindex(s::AbstractString, i::Int) = next(s,i)[1]  # the character component of next
+getindex(s::AbstractString, i::Integer) = s[Int(i)]
+getindex(s::AbstractString, x::Real) = s[to_index(x)]
+getindex{T<:Integer}(s::AbstractString, r::UnitRange{T}) = s[Int(first(r)):Int(last(r))]
+# TODO: handle other ranges with stride ±1 specially?
+getindex(s::AbstractString, v::AbstractVector) =
+    sprint(length(v), io->(for i in v write(io,s[i]) end))  # writes s[i] for each i in v, in order
+
+symbol(s::AbstractString) = symbol(bytestring(s))
+
+sizeof(s::AbstractString) = error("type $(typeof(s)) has no canonical binary representation")
+
+eltype{T<:AbstractString}(::Type{T}) = Char
+
+(*)(s1::AbstractString, ss::AbstractString...) = string(s1, ss...)  # * on strings is concatenation via string
+
+length(s::DirectIndexString) = endof(s)
+function length(s::AbstractString)  # character count, found by walking the string with next
+    i = start(s)
+    if done(s,i)  # empty string
+        return 0
+    end
+    n = 1
+    while true
+        c, j = next(s,i)
+        if done(s,j)  # the character at i was the last one
+            return n
+        end
+        n += 1
+        i = j
+    end
+end
+
+## String comparison functions ##
+
+function cmp(a::AbstractString, b::AbstractString)  # Char-wise three-way compare; returns -1, 0 or +1
+    if a === b  # identical object: equal without scanning
+        return 0
+    end
+    i = start(a)  # iteration state for a
+    j = start(b)  # separate iteration state for b
+    while !done(a,i) && !done(b,j)  # b must be tested with its own state j, never with a's state i
+        c, i = next(a,i)
+        d, j = next(b,j)
+        if c != d
+            return c < d ? -1 : +1  # the first differing character decides
+        end
+    end
+    done(a,i) && !done(b,j) ? -1 :  # a ran out first, b still has characters
+    !done(a,i) && done(b,j) ? +1 : 0  # b ran out first; otherwise both ended together
+end
+
+==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0
+isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0
+
+# faster comparisons for byte strings and symbols
+
+cmp(a::ByteString, b::ByteString) = lexcmp(a.data, b.data)
+cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b)))  # sign() reduces strcmp's result to -1, 0 or +1
+
+==(a::ByteString, b::ByteString) = endof(a) == endof(b) && cmp(a,b) == 0  # endof check first, cmp only if it matches
+isless(a::Symbol, b::Symbol) = cmp(a,b) < 0
+
+## Generic validation functions ##
+
+isvalid(s::DirectIndexString, i::Integer) = (start(s) <= i <= endof(s))
+function isvalid(s::AbstractString, i::Integer)  # false outside 1:endof(s); otherwise true iff next(s,i) does not throw
+    i < 1 && return false
+    done(s,i) && return false
+    try
+        next(s,i)
+        true
+    catch
+        false
+    end
+end
+
+## Generic indexing functions ##
+
+prevind(s::DirectIndexString, i::Integer) = i-1
+prevind(s::AbstractArray , i::Integer) = i-1
+nextind(s::DirectIndexString, i::Integer) = i+1
+nextind(s::AbstractArray , i::Integer) = i+1
+
+function prevind(s::AbstractString, i::Integer)  # nearest valid index below i
+    e = endof(s)
+    if i > e  # anything past the end maps to endof(s)
+        return e
+    end
+    j = i-1
+    while j >= 1  # scan downwards for the first valid index
+        if isvalid(s,j)
+            return j
+        end
+        j -= 1
+    end
+    return 0 # out of range
+end
+
+function nextind(s::AbstractString, i::Integer)  # nearest valid index above i
+    e = endof(s)
+    if i < 1  # anything before the start maps to 1
+        return 1
+    end
+    if i > e  # already past the end: just step by one
+        return i+1
+    end
+    for j = i+1:e  # scan upwards for the first valid index
+        if isvalid(s,j)
+            return j
+        end
+    end
+    next(s,e)[2] # out of range
+end
+
+checkbounds(s::AbstractString, i::Integer) = start(s) <= i <= endof(s) || throw(BoundsError(s, i))  # true, or throws BoundsError
+checkbounds(s::AbstractString, i::Real) = checkbounds(s, to_index(i))
+checkbounds{T<:Integer}(s::AbstractString, r::Range{T}) = isempty(r) || (minimum(r) >= start(s) && maximum(r) <= endof(s)) || throw(BoundsError(s, r))  # an empty range is always in bounds
+checkbounds{T<:Real}(s::AbstractString, I::AbstractArray{T}) = all(i -> checkbounds(s, i), I)
+
+ind2chr(s::DirectIndexString, i::Integer) = begin checkbounds(s,i); i end
+chr2ind(s::DirectIndexString, i::Integer) = begin checkbounds(s,i); i end
+
+function ind2chr(s::AbstractString, i::Integer)  # character number of the character at index i
+    s[i] # throws error if invalid
+    j = 1
+    k = start(s)
+    while true
+        c, l = next(s,k)
+        if i <= k  # reached the character that starts at (or after) i
+            return j
+        end
+        j += 1
+        k = l
+    end
+end
+
+function chr2ind(s::AbstractString, i::Integer)  # index at which the i-th character starts
+    i < start(s) && throw(BoundsError(s, i))
+    j = 1
+    k = start(s)
+    while true
+        c, l = next(s,k)  # runs on every pass, including once k is past the end when i exceeds the character count
+        if i == j
+            return k
+        end
+        j += 1
+        k = l
+    end
+end
+
+immutable EachStringIndex{T<:AbstractString}  # iterator wrapper: yields the indices visited by nextind
+    s::T
+end
+eachindex(s::AbstractString) = EachStringIndex(s)
+
+length(e::EachStringIndex) = length(e.s)
+start(e::EachStringIndex) = start(e.s)
+next(e::EachStringIndex, state) = (state, nextind(e.s, state))  # the value produced is the index itself
+done(e::EachStringIndex, state) = done(e.s, state)
+eltype(e::EachStringIndex) = Int
+
+typealias Chars Union{Char,AbstractVector{Char},Set{Char}}
+
+typealias ByteArray Union{Vector{UInt8},Vector{Int8}}
+
+## character column width function ##
+
+strwidth(s::AbstractString) = (w=0; for c in s; w += charwidth(c); end; w)  # sum of charwidth over all characters
+
+isascii(c::Char) = c < Char(0x80)
+isascii(s::AbstractString) = all(isascii, s)
+isascii(s::ASCIIString) = true
+
+## string promotion rules ##
+
+promote_rule{S<:AbstractString,T<:AbstractString}(::Type{S}, ::Type{T}) = UTF8String
+
+isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F'
+isxdigit(s::AbstractString) = all(isxdigit, s)
+need_full_hex(s::AbstractString, i::Int) = !done(s,i) && isxdigit(next(s,i)[1])  # true when the character at i exists and is a hex digit
+
+## checking UTF-8 & ASCII validity ##
+
+byte_string_classify(data::Vector{UInt8}) =
+    ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data))
+byte_string_classify(s::ByteString) = byte_string_classify(s.data)
+    # 0: neither valid ASCII nor UTF-8
+    # 1: valid ASCII
+    # 2: valid UTF-8
+
+isvalid(::Type{ASCIIString}, s::Union{Vector{UInt8},ByteString}) = byte_string_classify(s) == 1
+isvalid(::Type{UTF8String}, s::Union{Vector{UInt8},ByteString}) = byte_string_classify(s) != 0  # class 1 (ASCII) also counts as valid UTF-8
+
+## uppercase and lowercase transformations ##
+uppercase(s::AbstractString) = map(uppercase, s)
+lowercase(s::AbstractString) = map(lowercase, s)
+
+function ucfirst(s::AbstractString) + isempty(s) || isupper(s[1]) ? s : string(uppercase(s[1]),s[nextind(s,1):end]) +end +function lcfirst(s::AbstractString) + isempty(s) || islower(s[1]) ? s : string(lowercase(s[1]),s[nextind(s,1):end]) +end + +## string map, filter, has ## + +map_result(s::AbstractString, a::Vector{UInt8}) = UTF8String(a) +map_result(s::Union{ASCIIString,SubString{ASCIIString}}, a::Vector{UInt8}) = bytestring(a) + +function map(f, s::AbstractString) + out = IOBuffer(Array(UInt8,endof(s)),true,true) + truncate(out,0) + for c in s + c2 = f(c) + if !isa(c2,Char) + throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) + end + write(out, c2::Char) + end + map_result(s, takebuf_array(out)) +end + +function filter(f, s::AbstractString) + out = IOBuffer(Array(UInt8,endof(s)),true,true) + truncate(out,0) + for c in s + if f(c) + write(out, c) + end + end + takebuf_string(out) +end + diff --git a/base/strings/io.jl b/base/strings/io.jl new file mode 100644 index 0000000000000..aade25070336e --- /dev/null +++ b/base/strings/io.jl @@ -0,0 +1,247 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +## core text I/O ## + +print(io::IO, x) = show(io, x) +print(io::IO, xs...) = for x in xs print(io, x) end + +println(io::IO, xs...) = print(io, xs..., '\n') + +print(xs...) = print(STDOUT, xs...) +println(xs...) = println(STDOUT, xs...) + +## conversion of general objects to strings ## + +function print_to_string(xs...) + # specialized for performance reasons + s = IOBuffer(Array(UInt8,isa(xs[1],AbstractString) ? endof(xs[1]) : 0), true, true) + for x in xs + print(s, x) + end + d = s.data + resize!(d,s.size) + bytestring(d) +end + +string(xs...) = print_to_string(xs...) +bytestring(s::AbstractString...) = print_to_string(s...) 
+ +print(io::IO, s::AbstractString) = (write(io, s); nothing) +write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); end; len) +show(io::IO, s::AbstractString) = print_quoted(io, s) + +write{T<:ByteString}(to::AbstractIOBuffer, s::SubString{T}) = + s.endof==0 ? 0 : write_sub(to, s.string.data, s.offset + 1, nextind(s, s.endof) - 1) + +## printing literal quoted string data ## + +# this is the inverse of print_unescaped_chars(io, s, "\\\") + +function print_quoted_literal(io, s::AbstractString) + print(io, '"') + for c = s; c == '"' ? print(io, "\\\"") : print(io, c); end + print(io, '"') +end + +function repr(x) + s = IOBuffer() + showall(s, x) + takebuf_string(s) +end + +# IOBuffer views of a (byte)string: +IOBuffer(str::ByteString) = IOBuffer(str.data) +IOBuffer{T<:ByteString}(s::SubString{T}) = IOBuffer(sub(s.string.data, s.offset + 1 : s.offset + sizeof(s))) + +# join is implemented using IO +function print_joined(io, strings, delim, last) + i = start(strings) + if done(strings,i) + return + end + str, i = next(strings,i) + print(io, str) + is_done = done(strings,i) + while !is_done + str, i = next(strings,i) + is_done = done(strings,i) + print(io, is_done ? last : delim) + print(io, str) + end +end + +function print_joined(io, strings, delim) + i = start(strings) + is_done = done(strings,i) + while !is_done + str, i = next(strings,i) + is_done = done(strings,i) + print(io, str) + if !is_done + print(io, delim) + end + end +end +print_joined(io, strings) = print_joined(io, strings, "") + +join(args...) = sprint(print_joined, args...) + +## string escaping & unescaping ## + +escape_nul(s::AbstractString, i::Int) = + !done(s,i) && '0' <= next(s,i)[1] <= '7' ? "\\x00" : "\\0" + +function print_escaped(io, s::AbstractString, esc::AbstractString) + i = start(s) + while !done(s,i) + c, j = next(s,i) + c == '\0' ? print(io, escape_nul(s,j)) : + c == '\e' ? print(io, "\\e") : + c == '\\' ? print(io, "\\\\") : + c in esc ? 
print(io, '\\', c) : + '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : + isprint(c) ? print(io, c) : + c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : + c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : + print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4)) + i = j + end +end + +escape_string(s::AbstractString) = sprint(endof(s), print_escaped, s, "\"") +function print_quoted(io, s::AbstractString) + print(io, '"') + print_escaped(io, s, "\"\$") #"# work around syntax highlighting problem + print(io, '"') +end + +# bare minimum unescaping function unescapes only given characters + +function print_unescaped_chars(io, s::AbstractString, esc::AbstractString) + if !('\\' in esc) + esc = string("\\", esc) + end + i = start(s) + while !done(s,i) + c, i = next(s,i) + if c == '\\' && !done(s,i) && s[i] in esc + c, i = next(s,i) + end + print(io, c) + end +end + +unescape_chars(s::AbstractString, esc::AbstractString) = + sprint(endof(s), print_unescaped_chars, s, esc) + +# general unescaping of traditional C and Unicode escape sequences + +function print_unescaped(io, s::AbstractString) + i = start(s) + while !done(s,i) + c, i = next(s,i) + if !done(s,i) && c == '\\' + c, i = next(s,i) + if c == 'x' || c == 'u' || c == 'U' + n = k = 0 + m = c == 'x' ? 2 : + c == 'u' ? 4 : 8 + while (k+=1) <= m && !done(s,i) + c, j = next(s,i) + n = '0' <= c <= '9' ? n<<4 + c-'0' : + 'a' <= c <= 'f' ? n<<4 + c-'a'+10 : + 'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break + i = j + end + if k == 1 + throw(ArgumentError("\\x used with no following hex digits in $(repr(s))")) + end + if m == 2 # \x escape sequence + write(io, UInt8(n)) + else + print(io, Char(n)) + end + elseif '0' <= c <= '7' + k = 1 + n = c-'0' + while (k+=1) <= 3 && !done(s,i) + c, j = next(s,i) + n = ('0' <= c <= '7') ? n<<3 + c-'0' : break + i = j + end + if n > 255 + throw(ArgumentError("octal escape sequence out of range")) + end + write(io, UInt8(n)) + else + print(io, c == 'a' ? '\a' : + c == 'b' ? 
'\b' : + c == 't' ? '\t' : + c == 'n' ? '\n' : + c == 'v' ? '\v' : + c == 'f' ? '\f' : + c == 'r' ? '\r' : + c == 'e' ? '\e' : c) + end + else + print(io, c) + end + end +end + +unescape_string(s::AbstractString) = sprint(endof(s), print_unescaped, s) + +macro b_str(s); :($(unescape_string(s)).data); end + +## Count indentation, unindent ## + +function blank_width(c::Char) + c == ' ' ? 1 : + c == '\t' ? 8 : + throw(ArgumentError("$(repr(c)) not a blank character")) +end + +# width of leading blank space, also check if string is blank +function indentation(s::AbstractString) + count = 0 + for c in s + if c == ' ' || c == '\t' + count += blank_width(c) + else + return count, false + end + end + count, true +end + +function unindent(s::AbstractString, indent::Int) + indent == 0 && return s + buf = IOBuffer(Array(UInt8,endof(s)), true, true) + truncate(buf,0) + a = i = start(s) + cutting = false + cut = 0 + while !done(s,i) + c,i_ = next(s,i) + if cutting && (c == ' ' || c == '\t') + a = i_ + cut += blank_width(c) + if cut == indent + cutting = false + elseif cut > indent + cutting = false + for _ = (indent+1):cut write(buf, ' ') end + end + elseif c == '\n' + print(buf, s[a:i]) + a = i_ + cutting = true + cut = 0 + else + cutting = false + end + i = i_ + end + print(buf, s[a:end]) + takebuf_string(buf) +end diff --git a/base/strings/search.jl b/base/strings/search.jl new file mode 100644 index 0000000000000..1bd242d22e4f3 --- /dev/null +++ b/base/strings/search.jl @@ -0,0 +1,360 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +function search(s::AbstractString, c::Chars, i::Integer) + if isempty(c) + return 1 <= i <= nextind(s,endof(s)) ? 
i : + throw(BoundsError(s, i)) + end + if i < 1 || i > nextind(s,endof(s)) + throw(BoundsError(s, i)) + end + while !done(s,i) + d, j = next(s,i) + if d in c + return i + end + i = j + end + return 0 +end +search(s::AbstractString, c::Chars) = search(s,c,start(s)) + +in(c::Char, s::AbstractString) = (search(s,c)!=0) + +function _searchindex(s, t, i) + if isempty(t) + return 1 <= i <= nextind(s,endof(s)) ? i : + throw(BoundsError(s, i)) + end + t1, j2 = next(t,start(t)) + while true + i = search(s,t1,i) + if i == 0 return 0 end + c, ii = next(s,i) + j = j2; k = ii + matched = true + while !done(t,j) + if done(s,k) + matched = false + break + end + c, k = next(s,k) + d, j = next(t,j) + if c != d + matched = false + break + end + end + if matched + return i + end + i = ii + end +end + +function _search_bloom_mask(c) + UInt64(1) << (c & 63) +end + +function _searchindex(s::Array, t::Array, i) + n = length(t) + m = length(s) + + if n == 0 + return 1 <= i <= m+1 ? max(1, i) : 0 + elseif m == 0 + return 0 + elseif n == 1 + return search(s, t[1], i) + end + + w = m - n + if w < 0 || i - 1 > w + return 0 + end + + bloom_mask = UInt64(0) + skip = n - 1 + tlast = t[end] + for j in 1:n + bloom_mask |= _search_bloom_mask(t[j]) + if t[j] == tlast && j < n + skip = n - j - 1 + end + end + + i -= 1 + while i <= w + if s[i+n] == tlast + # check candidate + j = 0 + while j < n - 1 + if s[i+j+1] != t[j+1] + break + end + j += 1 + end + + # match found + if j == n - 1 + return i+1 + end + + # no match, try to rule out the next character + if i < w && bloom_mask & _search_bloom_mask(s[i+n+1]) == 0 + i += n + else + i += skip + end + elseif i < w + if bloom_mask & _search_bloom_mask(s[i+n+1]) == 0 + i += n + end + end + i += 1 + end + + 0 +end + +searchindex(s::ByteArray, t::ByteArray, i) = _searchindex(s,t,i) +searchindex(s::AbstractString, t::AbstractString, i::Integer) = _searchindex(s,t,i) +searchindex(s::AbstractString, t::AbstractString) = searchindex(s,t,start(s)) 
+searchindex(s::AbstractString, c::Char, i::Integer) = _searchindex(s,c,i) +searchindex(s::AbstractString, c::Char) = searchindex(s,c,start(s)) + +function searchindex(s::ByteString, t::ByteString, i::Integer=1) + # Check for fast case of a single byte + # (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead) + if endof(t) == 1 + search(s, t[1], i) + else + searchindex(s.data, t.data, i) + end +end + +function search(s::ByteArray, t::ByteArray, i) + idx = searchindex(s,t,i) + if isempty(t) + idx:idx-1 + else + idx:(idx > 0 ? idx + endof(t) - 1 : -1) + end +end + +function search(s::AbstractString, t::AbstractString, i::Integer=start(s)) + idx = searchindex(s,t,i) + if isempty(t) + idx:idx-1 + else + idx:(idx > 0 ? idx + endof(t) - 1 : -1) + end +end + +function rsearch(s::AbstractString, c::Chars) + j = search(RevString(s), c) + j == 0 && return 0 + endof(s)-j+1 +end + +function rsearch(s::AbstractString, c::Chars, i::Integer) + e = endof(s) + j = search(RevString(s), c, e-i+1) + j == 0 && return 0 + e-j+1 +end + +function _rsearchindex(s, t, i) + if isempty(t) + return 1 <= i <= nextind(s,endof(s)) ? i : + throw(BoundsError(s, i)) + end + t = RevString(t) + rs = RevString(s) + l = endof(s) + t1, j2 = next(t,start(t)) + while true + i = rsearch(s,t1,i) + if i == 0 return 0 end + c, ii = next(rs,l-i+1) + j = j2; k = ii + matched = true + while !done(t,j) + if done(rs,k) + matched = false + break + end + c, k = next(rs,k) + d, j = next(t,j) + if c != d + matched = false + break + end + end + if matched + return nextind(s,l-k+1) + end + i = l-ii+1 + end +end + +function _rsearchindex(s::Array, t::Array, k) + n = length(t) + m = length(s) + + if n == 0 + return 0 <= k <= m ? 
max(k, 1) : 0 + elseif m == 0 + return 0 + elseif n == 1 + return rsearch(s, t[1], k) + end + + w = m - n + if w < 0 || k <= 0 + return 0 + end + + bloom_mask = UInt64(0) + skip = n - 1 + tfirst = t[1] + for j in n:-1:1 + bloom_mask |= _search_bloom_mask(t[j]) + if t[j] == tfirst && j > 1 + skip = j - 2 + end + end + + i = min(k - n + 1, w + 1) + while i > 0 + if s[i] == tfirst + # check candidate + j = 1 + while j < n + if s[i+j] != t[j+1] + break + end + j += 1 + end + + # match found + if j == n + return i + end + + # no match, try to rule out the next character + if i > 1 && bloom_mask & _search_bloom_mask(s[i-1]) == 0 + i -= n + else + i -= skip + end + elseif i > 1 + if bloom_mask & _search_bloom_mask(s[i-1]) == 0 + i -= n + end + end + i -= 1 + end + + 0 +end + +rsearchindex(s::ByteArray,t::ByteArray,i) = _rsearchindex(s,t,i) +rsearchindex(s::AbstractString, t::AbstractString, i::Integer) = _rsearchindex(s,t,i) +rsearchindex(s::AbstractString, t::AbstractString) = (isempty(s) && isempty(t)) ? 1 : rsearchindex(s,t,endof(s)) + +function rsearchindex(s::ByteString, t::ByteString) + # Check for fast case of a single byte + # (for multi-byte UTF-8 sequences, use rsearchindex instead) + if endof(t) == 1 + rsearch(s, t[1]) + else + _rsearchindex(s.data, t.data, length(s.data)) + end +end + +function rsearchindex(s::ByteString, t::ByteString, i::Integer) + # Check for fast case of a single byte + # (for multi-byte UTF-8 sequences, use rsearchindex instead) + if endof(t) == 1 + rsearch(s, t[1], i) + elseif endof(t) != 0 + _rsearchindex(s.data, t.data, nextind(s, i)-1) + elseif i > sizeof(s) + return 0 + elseif i == 0 + return 1 + else + return i + end +end + +function rsearch(s::ByteArray, t::ByteArray, i::Integer) + idx = rsearchindex(s,t,i) + if isempty(t) + idx:idx-1 + else + idx:(idx > 0 ? 
idx + endof(t) - 1 : -1) + end +end + +function rsearch(s::AbstractString, t::AbstractString, i::Integer=endof(s)) + idx = rsearchindex(s,t,i) + if isempty(t) + idx:idx-1 + else + idx:(idx > 0 ? idx + endof(t) - 1 : -1) + end +end + +contains(haystack::AbstractString, needle::AbstractString) = searchindex(haystack,needle)!=0 + +in(::AbstractString, ::AbstractString) = error("use contains(x,y) for string containment") + +# ByteArray optimizations + +# find the index of the first occurrence of a value in a byte array + +function search(a::ByteArray, b::Union{Int8,UInt8}, i::Integer) + if i < 1 + throw(BoundsError(a, i)) + end + n = length(a) + if i > n + return i == n+1 ? 0 : throw(BoundsError(a, i)) + end + p = pointer(a) + q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) + q == C_NULL ? 0 : Int(q-p+1) +end +function search(a::ByteArray, b::Char, i::Integer) + if isascii(b) + search(a,UInt8(b),i) + else + search(a,string(b).data,i).start + end +end +search(a::ByteArray, b::Union{Int8,UInt8,Char}) = search(a,b,1) + +function rsearch(a::ByteArray, b::Union{Int8,UInt8}, i::Integer) + if i < 1 + return i == 0 ? 0 : throw(BoundsError(a, i)) + end + n = length(a) + if i > n + return i == n+1 ? 0 : throw(BoundsError(a, i)) + end + p = pointer(a) + q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) + q == C_NULL ? 0 : Int(q-p+1) +end +function rsearch(a::ByteArray, b::Char, i::Integer) + if isascii(b) + rsearch(a,UInt8(b),i) + else + rsearch(a,string(b).data,i).start + end +end +rsearch(a::ByteArray, b::Union{Int8,UInt8,Char}) = rsearch(a,b,length(a)) + + diff --git a/base/strings/types.jl b/base/strings/types.jl new file mode 100644 index 0000000000000..1bc4f5b9f28c0 --- /dev/null +++ b/base/strings/types.jl @@ -0,0 +1,224 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license
+
+# SubString, RevString, RepString, and RopeString types
+
+## substrings reference original strings ##
+
+# A window onto `string`: index `k` of the SubString maps to index
+# `offset + k` of the parent, and `endof` is the last index of the window.
+immutable SubString{T<:AbstractString} <: AbstractString
+    string::T
+    offset::Int
+    endof::Int
+
+    # Build the substring s[i:j].
+    # NOTE(review): the empty-range branch, the isvalid(s,i) check and the
+    # `while` header were missing from this copy (text between `<` and `>`
+    # was lost); they are reconstructed here -- confirm against upstream.
+    function SubString(s::T, i::Int, j::Int)
+        if i > endof(s) || j < i
+            # start past the end, or reversed bounds: empty substring
+            return new(s, i-1, 0)
+        else
+            if !isvalid(s,i)
+                throw(ArgumentError("invalid SubString index"))
+            end
+
+            # back `j` up until it lands on a valid index of `s`
+            while !isvalid(s,j) && j > i
+                j -= 1
+            end
+
+            o = i-1
+            new(s, o, max(0, j-o))
+        end
+    end
+end
+SubString{T<:AbstractString}(s::T, i::Int, j::Int) = SubString{T}(s, i, j)
+SubString(s::SubString, i::Int, j::Int) = SubString(s.string, s.offset+i, s.offset+j)
+SubString(s::AbstractString, i::Integer, j::Integer) = SubString(s, Int(i), Int(j))
+SubString(s::AbstractString, i::Integer) = SubString(s, i, endof(s))
+
+sizeof(s::SubString{ASCIIString}) = s.endof
+sizeof(s::SubString{UTF8String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1
+
+# TODO: length(s::SubString) = ??
+# default implementation will work but it's slow
+# can this be delegated efficiently somehow?
+# that may require additional string interfaces
+length{T<:DirectIndexString}(s::SubString{T}) = endof(s)
+
+function length(s::SubString{UTF8String})
+    return s.endof==0 ?
0 : Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t), + pointer(s), nextind(s, s.endof) - 1)) +end + +function next(s::SubString, i::Int) + if i < 1 || i > s.endof + throw(BoundsError(s, i)) + end + c, i = next(s.string, i+s.offset) + c, i-s.offset +end + +function getindex(s::SubString, i::Int) + if i < 1 || i > s.endof + throw(BoundsError(s, i)) + end + getindex(s.string, i+s.offset) +end + +endof(s::SubString) = s.endof + +function isvalid(s::SubString, i::Integer) + return (start(s) <= i <= endof(s)) && isvalid(s.string, s.offset+i) +end + +isvalid{T<:DirectIndexString}(s::SubString{T}, i::Integer) = (start(s) <= i <= endof(s)) + +ind2chr{T<:DirectIndexString}(s::SubString{T}, i::Integer) = begin checkbounds(s,i); i end +chr2ind{T<:DirectIndexString}(s::SubString{T}, i::Integer) = begin checkbounds(s,i); i end + +nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset +prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset + +convert{T<:AbstractString}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s)) + +bytestring{T <: ByteString}(p::SubString{T}) = bytestring(p.string.data[1+p.offset:p.offset+nextind(p, p.endof)-1]) + +function getindex(s::AbstractString, r::UnitRange{Int}) + if first(r) < 1 || endof(s) < last(r) + throw(BoundsError(s, r)) + end + SubString(s, first(r), last(r)) +end + +isascii(s::SubString{ASCIIString}) = true + +function cmp{T<:ByteString,S<:ByteString}(a::SubString{T}, b::SubString{S}) + na = sizeof(a) + nb = sizeof(b) + c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), + pointer(a), pointer(b), min(na,nb)) + c < 0 ? -1 : c > 0 ? 
+1 : cmp(na,nb) +end + +## reversed strings without data movement ## + +immutable RevString{T<:AbstractString} <: AbstractString + string::T +end + +endof(s::RevString) = endof(s.string) +length(s::RevString) = length(s.string) +sizeof(s::RevString) = sizeof(s.string) + +function next(s::RevString, i::Int) + n = endof(s); j = n-i+1 + (s.string[j], n-prevind(s.string,j)+1) +end + +reverse(s::AbstractString) = RevString(s) +reverse(s::RevString) = s.string + +isascii(s::RevString{ASCIIString}) = true + +## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)] + +reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i +reverseind(s::RevString, i::Integer) = endof(s) - i + 1 +lastidx(s::AbstractString) = nextind(s, endof(s)) - 1 +lastidx(s::DirectIndexString) = length(s) +reverseind(s::SubString, i::Integer) = + reverseind(s.string, lastidx(s.string)-s.offset-s.endof+i) - s.offset + +## efficient representation of repeated strings ## + +immutable RepString <: AbstractString + string::AbstractString + repeat::Integer +end + +function endof(s::RepString) + e = endof(s.string) + (next(s.string,e)[2]-1) * (s.repeat-1) + e +end +length(s::RepString) = length(s.string)*s.repeat +sizeof(s::RepString) = sizeof(s.string)*s.repeat + +function next(s::RepString, i::Int) + if i < 1 + throw(BoundsError(s, i)) + end + e = endof(s.string) + sz = next(s.string,e)[2]-1 + + r, j = divrem(i-1, sz) + j += 1 + + if r >= s.repeat || j > e + throw(BoundsError(s, i)) + end + + c, k = next(s.string, j) + c, k-j+i +end + +function repeat(s::AbstractString, r::Integer) + r < 0 ? throw(ArgumentError("can't repeat a string $r times")) : + r == 0 ? "" : + r == 1 ? 
s : + RepString(s,r) +end + +convert(::Type{RepString}, s::AbstractString) = RepString(s,1) + +function repeat(s::ByteString, r::Integer) + r < 0 && throw(ArgumentError("can't repeat a string $r times")) + d = s.data; n = length(d) + out = Array(UInt8, n*r) + for i=1:r + copy!(out, 1+(i-1)*n, d, 1, n) + end + convert(typeof(s), out) +end + +(^)(s::AbstractString, r::Integer) = repeat(s,r) + +## ropes for efficient concatenation, etc. ## + +immutable RopeString <: AbstractString + head::AbstractString + tail::AbstractString + depth::Int32 + endof::Int + + RopeString(h::RopeString, t::RopeString) = + strdepth(h.tail) + strdepth(t) < strdepth(h.head) ? + RopeString(h.head, RopeString(h.tail, t)) : + new(h, t, max(h.depth,t.depth)+1, endof(h)+endof(t)) + + RopeString(h::RopeString, t::AbstractString) = + strdepth(h.tail) < strdepth(h.head) ? + RopeString(h.head, RopeString(h.tail, t)) : + new(h, t, h.depth+1, endof(h)+endof(t)) + + RopeString(h::AbstractString, t::RopeString) = + strdepth(t.head) < strdepth(t.tail) ? + RopeString(RopeString(h, t.head), t.tail) : + new(h, t, t.depth+1, endof(h)+endof(t)) + + RopeString(h::AbstractString, t::AbstractString) = + new(h, t, 1, endof(h)+endof(t)) +end +RopeString(s::AbstractString) = RopeString(s,"") + +strdepth(s::AbstractString) = 0 +strdepth(s::RopeString) = s.depth + +function next(s::RopeString, i::Int) + eh = endof(s.head) + if i <= eh + return next(s.head, i) + else + c, j = next(s.tail, i-eh) + return c, j+eh + end +end + +endof(s::RopeString) = s.endof +length(s::RopeString) = length(s.head) + length(s.tail) +write(io::IO, s::RopeString) = (write(io, s.head); write(io, s.tail)) +sizeof(s::RopeString) = sizeof(s.head) + sizeof(s.tail) diff --git a/base/strings/util.jl b/base/strings/util.jl new file mode 100644 index 0000000000000..c26107c71663a --- /dev/null +++ b/base/strings/util.jl @@ -0,0 +1,233 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license
+
+# starts with and ends with predicates
+
+# Return true when `b` is a prefix of `a`, comparing character by character.
+function startswith(a::AbstractString, b::AbstractString)
+    i = start(a)
+    j = start(b)
+    # `i` and `j` are independent iteration states; `b` must be tested with
+    # its own state, not with `a`'s.
+    while !done(a,i) && !done(b,j)
+        c, i = next(a,i)
+        d, j = next(b,j)
+        if c != d return false end
+    end
+    # a prefix only if all of `b` was consumed before `a` ran out
+    done(b,j)
+end
+startswith(str::AbstractString, chars::Chars) = !isempty(str) && str[start(str)] in chars
+
+function endswith(a::AbstractString, b::AbstractString)
+    i = endof(a)
+    j = endof(b)
+    a1 = start(a)
+    b1 = start(b)
+    while a1 <= i && b1 <= j
+        c = a[i]
+        d = b[j]
+        if c != d return false end
+        i = prevind(a,i)
+        j = prevind(b,j)
+    end
+    j < b1
+end
+endswith(str::AbstractString, chars::Chars) = !isempty(str) && str[end] in chars
+
+startswith(a::ByteString, b::ByteString) = startswith(a.data, b.data)
+# memcmp, not strncmp: strncmp stops at the first NUL byte, so arrays that
+# differ only after an embedded zero would wrongly be reported as a match.
+# The length check keeps the comparison inside both arrays.
+startswith(a::Vector{UInt8}, b::Vector{UInt8}) =
+    (length(a) >= length(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0)
+
+# TODO: fast endswith
+
+chop(s::AbstractString) = s[1:end-1]
+
+function chomp(s::AbstractString)
+    i = endof(s)
+    if (i < 1 || s[i] != '\n') return s end
+    j = prevind(s,i)
+    if (j < 1 || s[j] != '\r') return s[1:i-1] end
+    return s[1:j-1]
+end
+chomp(s::ByteString) =
+    (endof(s) < 1 || s.data[end] != 0x0a) ? s :
+    (endof(s) < 2 || s.data[end-1] != 0x0d) ? s[1:end-1] : s[1:end-2]
+
+# NOTE: use with caution -- breaks the immutable string convention!
+function chomp!(s::ByteString)
+    if !isempty(s) && s.data[end] == 0x0a
+        n = (endof(s) < 2 || s.data[end-1] != 0x0d) ?
1 : 2 + ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) + end + return s +end +chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types + +const _default_delims = [' ','\t','\n','\v','\f','\r'] + +function lstrip(s::AbstractString, chars::Chars=_default_delims) + i = start(s) + while !done(s,i) + c, j = next(s,i) + if !(c in chars) + return s[i:end] + end + i = j + end + "" +end + +function rstrip(s::AbstractString, chars::Chars=_default_delims) + r = RevString(s) + i = start(r) + while !done(r,i) + c, j = next(r,i) + if !(c in chars) + return s[1:end-i+1] + end + i = j + end + "" +end + +strip(s::AbstractString) = lstrip(rstrip(s)) +strip(s::AbstractString, chars::Chars) = lstrip(rstrip(s, chars), chars) + +## String padding functions ## + +function lpad(s::AbstractString, n::Integer, p::AbstractString=" ") + m = n - strwidth(s) + if m <= 0; return s; end + l = strwidth(p) + if l==1 + return bytestring(p^m * s) + end + q = div(m,l) + r = m - q*l + i = r != 0 ? chr2ind(p, r) : -1 + bytestring(p^q*p[1:i]*s) +end + +function rpad(s::AbstractString, n::Integer, p::AbstractString=" ") + m = n - strwidth(s) + if m <= 0; return s; end + l = strwidth(p) + if l==1 + return bytestring(s * p^m) + end + q = div(m,l) + r = m - q*l + i = r != 0 ? chr2ind(p, r) : -1 + bytestring(s*p^q*p[1:i]) +end + +lpad(s, n::Integer, p=" ") = lpad(string(s),n,string(p)) +rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p)) +cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p) + +# splitter can be a Char, Vector{Char}, AbstractString, Regex, ... 
+# any splitter that provides search(s::AbstractString, splitter) +split{T<:SubString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _split(str, splitter, limit, keep, T[]) +split{T<:AbstractString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _split(str, splitter, limit, keep, SubString{T}[]) +function _split{T<:AbstractString,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U) + i = start(str) + n = endof(str) + r = search(str,splitter,i) + j, k = first(r), nextind(str,last(r)) + while 0 < j <= n && length(strs) != limit-1 + if i < k + if keep_empty || i < j + push!(strs, SubString(str,i,prevind(str,j))) + end + i = k + end + if k <= j; k = nextind(str,j) end + r = search(str,splitter,k) + j, k = first(r), nextind(str,last(r)) + end + if keep_empty || !done(str,i) + push!(strs, SubString(str,i)) + end + return strs +end + +# a bit oddball, but standard behavior in Perl, Ruby & Python: +split(str::AbstractString) = split(str, _default_delims; limit=0, keep=false) + +rsplit{T<:SubString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _rsplit(str, splitter, limit, keep, T[]) +rsplit{T<:AbstractString}(str::T, splitter ; limit::Integer=0, keep::Bool=true) = _rsplit(str, splitter, limit, keep, SubString{T}[]) +function _rsplit{T<:AbstractString,U<:Array}(str::T, splitter, limit::Integer, keep_empty::Bool, strs::U) + i = start(str) + n = endof(str) + r = rsearch(str,splitter) + j = first(r)-1 + k = last(r) + while((0 <= j < n) && (length(strs) != limit-1)) + if i <= k + (keep_empty || (k < n)) && unshift!(strs, SubString(str,k+1,n)) + n = j + end + (k <= j) && (j = prevind(str,j)) + r = rsearch(str,splitter,j) + j = first(r)-1 + k = last(r) + end + (keep_empty || (n > 0)) && unshift!(strs, SubString(str,1,n)) + return strs +end +#rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false) + +function replace(str::ByteString, pattern, repl::Function, limit::Integer) + n = 1 + e = endof(str) + i = a = start(str) + r 
= search(str,pattern,i)
+    j, k = first(r), last(r)
+    out = IOBuffer()
+    while j != 0
+        # copy the text before the match, then the replacement for the match
+        if i == a || i <= k
+            write(out, SubString(str,i,prevind(str,j)))
+            write(out, string(repl(SubString(str,j,k))))
+        end
+        # NOTE(review): this advance step and the `j > e` test were missing
+        # from this copy (text between `<` and `>` was lost); reconstructed
+        # -- confirm against upstream.
+        if k < j
+            # empty match (range j:j-1): keep the character at j in the
+            # output and resume the search one character later
+            i = j
+            k = nextind(str, j)
+        else
+            # non-empty match: continue right after it
+            i = k = nextind(str, k)
+        end
+        if j > e
+            break
+        end
+        r = search(str,pattern,k)
+        j, k = first(r), last(r)
+        # `limit` replacements done; the initial n = 1 means limit = 0 never stops here
+        n == limit && break
+        n += 1
+    end
+    # whatever follows the last match is copied through unchanged
+    write(out, SubString(str,i))
+    takebuf_string(out)
+end
+replace(s::AbstractString, pat, f::Function, n::Integer) = replace(bytestring(s), pat, f, n)
+replace(s::AbstractString, pat, r, n::Integer) = replace(s, pat, x->r, n)
+replace(s::AbstractString, pat, r) = replace(s, pat, r, 0)
+
+# hex <-> bytes conversion
+
+function hex2bytes(s::ASCIIString)
+    len = length(s)
+    iseven(len) || throw(ArgumentError("string length must be even: length($(repr(s))) == $len"))
+    arr = zeros(UInt8, div(len,2))
+    i = j = 0
+    while i < len
+        n = 0
+        c = s[i+=1]
+        n = '0' <= c <= '9' ? c - '0' :
+            'a' <= c <= 'f' ? c - 'a' + 10 :
+            'A' <= c <= 'F' ? c - 'A' + 10 :
+            throw(ArgumentError("not a hexadecimal string: $(repr(s))"))
+        c = s[i+=1]
+        n = '0' <= c <= '9' ? n << 4 + c - '0' :
+            'a' <= c <= 'f' ? n << 4 + c - 'a' + 10 :
+            'A' <= c <= 'F' ?
n << 4 + c - 'A' + 10 : + throw(ArgumentError("not a hexadecimal string: $(repr(s))")) + arr[j+=1] = n + end + return arr +end + +bytes2hex{T<:UInt8}(arr::Vector{T}) = join([hex(i,2) for i in arr]) diff --git a/base/sysimg.jl b/base/sysimg.jl index db3bc2f1ccff0..71fea54c4a0de 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -88,18 +88,13 @@ include("iterator.jl") include("osutils.jl") # strings & printing -include("utferror.jl") -include("utftypes.jl") -include("utfcheck.jl") include("char.jl") include("ascii.jl") -include("utf8.jl") -include("utf16.jl") -include("utf32.jl") include("iobuffer.jl") include("string.jl") -include("utf8proc.jl") -importall .UTF8proc +include("unicode.jl") +include("parse.jl") +include("shell.jl") include("regex.jl") include("base64.jl") importall .Base64 diff --git a/base/unicode.jl b/base/unicode.jl new file mode 100644 index 0000000000000..e0ed8b5d1b0a8 --- /dev/null +++ b/base/unicode.jl @@ -0,0 +1,10 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +include("unicode/UnicodeError.jl") +include("unicode/types.jl") +include("unicode/checkstring.jl") +include("unicode/utf8.jl") +include("unicode/utf16.jl") +include("unicode/utf32.jl") +include("unicode/utf8proc.jl") +importall .UTF8proc diff --git a/base/utferror.jl b/base/unicode/UnicodeError.jl similarity index 100% rename from base/utferror.jl rename to base/unicode/UnicodeError.jl diff --git a/base/utfcheck.jl b/base/unicode/checkstring.jl similarity index 100% rename from base/utfcheck.jl rename to base/unicode/checkstring.jl diff --git a/base/utftypes.jl b/base/unicode/types.jl similarity index 100% rename from base/utftypes.jl rename to base/unicode/types.jl diff --git a/base/utf16.jl b/base/unicode/utf16.jl similarity index 100% rename from base/utf16.jl rename to base/unicode/utf16.jl diff --git a/base/utf32.jl b/base/unicode/utf32.jl similarity index 67% rename from base/utf32.jl rename to base/unicode/utf32.jl index 
0d481bfda353c..612a3bbe4d061 100644 --- a/base/utf32.jl +++ b/base/unicode/utf32.jl @@ -101,3 +101,37 @@ function map(f, s::UTF32String) end UTF32String(out) end + +# Definitions for C compatible strings, that don't allow embedded +# '\0', and which are terminated by a '\0' + +containsnul(s::AbstractString) = '\0' in s +containsnul(s::ByteString) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s)) +containsnul(s::Union{UTF16String,UTF32String}) = findfirst(s.data, 0) != length(s.data) + +if sizeof(Cwchar_t) == 2 + const WString = UTF16String + const wstring = utf16 +elseif sizeof(Cwchar_t) == 4 + const WString = UTF32String + const wstring = utf32 +end +wstring(s::Cwstring) = wstring(box(Ptr{Cwchar_t}, unbox(Cwstring,s))) + +# Cwstring is defined in c.jl, but conversion needs to be defined here +# to have WString +function unsafe_convert(::Type{Cwstring}, s::WString) + if containsnul(s) + throw(ArgumentError("embedded NUL chars are not allowed in C strings: $(repr(s))")) + end + return Cwstring(unsafe_convert(Ptr{Cwchar_t}, s)) +end + +# pointer conversions of ASCII/UTF8/UTF16/UTF32 strings: +pointer(x::Union{ByteString,UTF16String,UTF32String}) = pointer(x.data) +pointer{T<:ByteString}(x::SubString{T}) = pointer(x.string.data) + x.offset +pointer(x::ByteString, i::Integer) = pointer(x.data)+(i-1) +pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x.offset + (i-1) +pointer(x::Union{UTF16String,UTF32String}, i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data)) +pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data)) +pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data)) diff --git a/base/utf8.jl b/base/unicode/utf8.jl similarity index 100% rename from base/utf8.jl rename to base/unicode/utf8.jl diff --git a/base/utf8proc.jl b/base/unicode/utf8proc.jl similarity index 100% rename from 
base/utf8proc.jl rename to base/unicode/utf8proc.jl diff --git a/test/choosetests.jl b/test/choosetests.jl index aac6a058be6a1..8dee768042cfa 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -15,12 +15,13 @@ Upon return, `tests` is a vector of fully-expanded test names, and """ -> function choosetests(choices = []) testnames = [ - "linalg", "core", "keywordargs", "numbers", "strings", + "linalg", "core", "keywordargs", "numbers", "printf", + "char", "string", "triplequote", "unicode", "dates", "dict", "hashing", "remote", "iobuffer", "staged", "arrayops", "tuple", "subarray", "reduce", "reducedim", "random", "abstractarray", "intfuncs", "simdloop", "blas", "sparse", "bitarray", "copy", "math", "fastmath", "functional", - "operators", "path", "ccall", "unicode", + "operators", "path", "ccall", "parse", "bigint", "sorting", "statistics", "spawn", "backtrace", "priorityqueue", "file", "mmap", "version", "resolve", "pollfd", "mpfr", "broadcast", "complex", "socket", @@ -29,7 +30,7 @@ function choosetests(choices = []) "euler", "show", "lineedit", "replcompletions", "repl", "replutil", "sets", "test", "goto", "llvmcall", "grisu", "nullable", "meta", "profile", "libgit2", "docs", "markdown", - "base64", "parser", "serialize", "functors", "char", "misc", + "base64", "serialize", "functors", "misc", "enums", "cmdlineargs", "i18n", "workspace", "libdl", "int" ] diff --git a/test/hashing.jl b/test/hashing.jl index 46658d230b176..0d3f51e504d51 100644 --- a/test/hashing.jl +++ b/test/hashing.jl @@ -79,7 +79,6 @@ for a in vals, b in vals @test isequal(a,b) == (hash(a)==hash(b)) end -@test hash(RopeString("1","2")) == hash("12") @test hash(SubString("--hello--",3,7)) == hash("hello") @test hash(:(X.x)) == hash(:(X.x)) @test hash(:(X.x)) != hash(:(X.y)) diff --git a/test/parser.jl b/test/parse.jl similarity index 58% rename from test/parser.jl rename to test/parse.jl index 5db51441fca18..c7d8294d98e93 100644 --- a/test/parser.jl +++ b/test/parse.jl @@ -153,3 +153,122 
@@ macro f(args...) end; @f "" """) == Expr(:toplevel, Expr(:macro, Expr(:call, :f, Expr(:..., :args)), Expr(:block,)), Expr(:macrocall, symbol("@f"), "")) + +# integer parsing +@test is(parse(Int32,"0",36),Int32(0)) +@test is(parse(Int32,"1",36),Int32(1)) +@test is(parse(Int32,"9",36),Int32(9)) +@test is(parse(Int32,"A",36),Int32(10)) +@test is(parse(Int32,"a",36),Int32(10)) +@test is(parse(Int32,"B",36),Int32(11)) +@test is(parse(Int32,"b",36),Int32(11)) +@test is(parse(Int32,"F",36),Int32(15)) +@test is(parse(Int32,"f",36),Int32(15)) +@test is(parse(Int32,"Z",36),Int32(35)) +@test is(parse(Int32,"z",36),Int32(35)) + +@test parse(Int,"0") == 0 +@test parse(Int,"-0") == 0 +@test parse(Int,"1") == 1 +@test parse(Int,"-1") == -1 +@test parse(Int,"9") == 9 +@test parse(Int,"-9") == -9 +@test parse(Int,"10") == 10 +@test parse(Int,"-10") == -10 +@test parse(Int64,"3830974272") == 3830974272 +@test parse(Int64,"-3830974272") == -3830974272 +@test parse(Int,'3') == 3 +@test parse(Int,'3', 8) == 3 + +parsebin(s) = parse(Int,s,2) +parseoct(s) = parse(Int,s,8) +parsehex(s) = parse(Int,s,16) + +@test parsebin("0") == 0 +@test parsebin("-0") == 0 +@test parsebin("1") == 1 +@test parsebin("-1") == -1 +@test parsebin("10") == 2 +@test parsebin("-10") == -2 +@test parsebin("11") == 3 +@test parsebin("-11") == -3 +@test parsebin("1111000011110000111100001111") == 252645135 +@test parsebin("-1111000011110000111100001111") == -252645135 + +@test parseoct("0") == 0 +@test parseoct("-0") == 0 +@test parseoct("1") == 1 +@test parseoct("-1") == -1 +@test parseoct("7") == 7 +@test parseoct("-7") == -7 +@test parseoct("10") == 8 +@test parseoct("-10") == -8 +@test parseoct("11") == 9 +@test parseoct("-11") == -9 +@test parseoct("72") == 58 +@test parseoct("-72") == -58 +@test parseoct("3172207320") == 434704080 +@test parseoct("-3172207320") == -434704080 + +@test parsehex("0") == 0 +@test parsehex("-0") == 0 +@test parsehex("1") == 1 +@test parsehex("-1") == -1 +@test parsehex("9") == 
9 +@test parsehex("-9") == -9 +@test parsehex("a") == 10 +@test parsehex("-a") == -10 +@test parsehex("f") == 15 +@test parsehex("-f") == -15 +@test parsehex("10") == 16 +@test parsehex("-10") == -16 +@test parsehex("0BADF00D") == 195948557 +@test parsehex("-0BADF00D") == -195948557 +@test parse(Int64,"BADCAB1E",16) == 3135023902 +@test parse(Int64,"-BADCAB1E",16) == -3135023902 +@test parse(Int64,"CafeBabe",16) == 3405691582 +@test parse(Int64,"-CafeBabe",16) == -3405691582 +@test parse(Int64,"DeadBeef",16) == 3735928559 +@test parse(Int64,"-DeadBeef",16) == -3735928559 + +@test parse(Int,"2\n") == 2 +@test parse(Int," 2 \n ") == 2 +@test parse(Int," 2 ") == 2 +@test parse(Int,"2 ") == 2 +@test parse(Int," 2") == 2 +@test parse(Int,"+2\n") == 2 +@test parse(Int,"-2") == -2 +@test_throws ArgumentError parse(Int," 2 \n 0") +@test_throws ArgumentError parse(Int,"2x") +@test_throws ArgumentError parse(Int,"-") + +# multibyte spaces +@test parse(Int, "3\u2003\u202F") == 3 +@test_throws ArgumentError parse(Int, "3\u2003\u202F,") + +@test parse(Int,'a') == 10 +@test_throws ArgumentError parse(Int,typemax(Char)) + +@test parse(Int,"1234") == 1234 +@test parse(Int,"0x1234") == 0x1234 +@test parse(Int,"0o1234") == 0o1234 +@test parse(Int,"0b1011") == 0b1011 +@test parse(Int,"-1234") == -1234 +@test parse(Int,"-0x1234") == -Int(0x1234) +@test parse(Int,"-0o1234") == -Int(0o1234) +@test parse(Int,"-0b1011") == -Int(0b1011) + +## FIXME: #4905, do these tests for Int128/UInt128! 
+for T in (Int8, Int16, Int32, Int64)
+    @test parse(T,string(typemin(T))) == typemin(T)
+    @test parse(T,string(typemax(T))) == typemax(T)
+    @test_throws OverflowError parse(T,string(big(typemin(T))-1))
+    @test_throws OverflowError parse(T,string(big(typemax(T))+1))
+end
+
+for T in (UInt8,UInt16,UInt32,UInt64)
+    @test parse(T,string(typemin(T))) == typemin(T)
+    @test parse(T,string(typemax(T))) == typemax(T)
+    @test_throws ArgumentError parse(T,string(big(typemin(T))-1))
+    @test_throws OverflowError parse(T,string(big(typemax(T))+1))
+end
diff --git a/test/printf.jl b/test/printf.jl
new file mode 100644
index 0000000000000..cb4c3694b2263
--- /dev/null
+++ b/test/printf.jl
@@ -0,0 +1,51 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+# @sprintf formatting tests
+# integers: conversions (%d %i %u %x %X), '#' prefix, sign/space flags, field width
+@test (@sprintf "%d" typemax(Int64)) == "9223372036854775807"
+@test (@sprintf "%i" 42) == "42"
+@test (@sprintf "%u" 42) == "42"
+@test (@sprintf "Test: %i" 42) == "Test: 42"
+@test (@sprintf "%#x" 42) == "0x2a"
+@test (@sprintf "%#o" 42) == "052"
+@test (@sprintf "%x" 42) == "2a"
+@test (@sprintf "%X" 42) == "2A"
+@test (@sprintf "% i" 42) == " 42"
+@test (@sprintf "%+i" 42) == "+42"
+@test (@sprintf "%4i" 42) == "  42"
+@test (@sprintf "%-4i" 42) == "42  "
+# floats: field width, precision, left-justify, zero-pad and '#' (forced decimal point)
+@test (@sprintf "%7.2f" 1.2345) == "   1.23"
+@test (@sprintf "%-7.2f" 1.2345) == "1.23   "
+@test (@sprintf "%07.2f" 1.2345) == "0001.23"
+@test (@sprintf "%.0f" 1.2345) == "1"
+@test (@sprintf "%#.0f" 1.2345) == "1."
+# Inf / NaN handling
+@test (@sprintf "%f" Inf) == "Inf"
+@test (@sprintf "%f" NaN) == "NaN"
+# scientific notation
+@test (@sprintf "%.4e" 1.2345) == "1.2345e+00"
+@test (@sprintf "%.0e" 3e142) == "3e+142"
+@test (@sprintf "%#.0e" 3e142) == "3.e+142"
+# hex float (the last case is right-justified in a 30-column field)
+@test (@sprintf "%a" 1.5) == "0x1.8p+0"
+@test (@sprintf "%#.0a" 1.5) == "0x2.p+0"
+@test (@sprintf "%+30a" 1/3) == "         +0x1.5555555555555p-2"
+# chars, given either as code points or as Char values
+@test (@sprintf "%c" 65) == "A"
+@test (@sprintf "%c" 'A') == "A"
+@test (@sprintf "%c" 248) == "ø"
+@test (@sprintf "%c" 'ø') == "ø"
+# strings, ASCII and non-ASCII
+@test (@sprintf "%s" "test") == "test"
+@test (@sprintf "%s" "tést") == "tést"
+# several conversions and flags combined in one format string
+@test (@sprintf "Test: %s%c%C%c%#-.0f." "t" 65 66 67 -42) == "Test: tABC-42.."
+# simple splatting of a single collection
+@test (@sprintf "%d%d" [1 2]...) == "12"
+# splatting mixed with ordinary positional arguments
+@test (@sprintf "%f %d %d %f" 1.0 [3 4]... 5) == "1.000000 3 4 5.000000"
+# several splatted collections; %9.5f pads "3.00000" to nine columns
+@test (@sprintf "%s %f %9.5f %d %d %d %d%d%d%d" [1:6;]... [7,8,9,10]...) == "1 2.000000   3.00000 4 5 6 78910"
+# splatted comprehension (3x3 matrix, consumed in column-major order)
+@test (@sprintf "%s %s %s %d %d %d %f %f %f" Any[10^x+y for x=1:3,y=1:3 ]...) == "11 101 1001 12 102 1002 13.000000 103.000000 1003.000000"
diff --git a/test/string.jl b/test/string.jl
new file mode 100644
index 0000000000000..df9581e7655f8
--- /dev/null
+++ b/test/string.jl
@@ -0,0 +1,7 @@
+# This file is a part of Julia. License is MIT: http://julialang.org/license
+
+include("strings/basic.jl")
+include("strings/types.jl")
+include("strings/search.jl")
+include("strings/util.jl")
+include("strings/io.jl")
diff --git a/test/strings.jl b/test/strings.jl
deleted file mode 100644
index cc304f48a097a..0000000000000
--- a/test/strings.jl
+++ /dev/null
@@ -1,2085 +0,0 @@
-# This file is a part of Julia.
License is MIT: http://julialang.org/license - -# string escaping & unescaping -cx = Any[ - 0x00000000 '\0' "\\0" - 0x00000001 '\x01' "\\x01" - 0x00000006 '\x06' "\\x06" - 0x00000007 '\a' "\\a" - 0x00000008 '\b' "\\b" - 0x00000009 '\t' "\\t" - 0x0000000a '\n' "\\n" - 0x0000000b '\v' "\\v" - 0x0000000c '\f' "\\f" - 0x0000000d '\r' "\\r" - 0x0000000e '\x0e' "\\x0e" - 0x0000001a '\x1a' "\\x1a" - 0x0000001b '\e' "\\e" - 0x0000001c '\x1c' "\\x1c" - 0x0000001f '\x1f' "\\x1f" - 0x00000020 ' ' " " - 0x0000002f '/' "/" - 0x00000030 '0' "0" - 0x00000039 '9' "9" - 0x0000003a ':' ":" - 0x00000040 '@' "@" - 0x00000041 'A' "A" - 0x0000005a 'Z' "Z" - 0x0000005b '[' "[" - 0x00000060 '`' "`" - 0x00000061 'a' "a" - 0x0000007a 'z' "z" - 0x0000007b '{' "{" - 0x0000007e '~' "~" - 0x0000007f '\x7f' "\\x7f" - 0x000000bf '\ubf' "\\ubf" - 0x000000ff '\uff' "\\uff" - 0x00000100 '\u100' "\\u100" - 0x000001ff '\u1ff' "\\u1ff" - 0x00000fff '\ufff' "\\ufff" - 0x00001000 '\u1000' "\\u1000" - 0x00001fff '\u1fff' "\\u1fff" - 0x0000ffff '\uffff' "\\uffff" - 0x00010000 '\U10000' "\\U10000" - 0x0001ffff '\U1ffff' "\\U1ffff" - 0x0002ffff '\U2ffff' "\\U2ffff" - 0x00030000 '\U30000' "\\U30000" - 0x000dffff '\Udffff' "\\Udffff" - 0x000e0000 '\Ue0000' "\\Ue0000" - 0x000effff '\Ueffff' "\\Ueffff" - 0x000f0000 '\Uf0000' "\\Uf0000" - 0x000fffff '\Ufffff' "\\Ufffff" - 0x00100000 '\U100000' "\\U100000" - 0x0010ffff '\U10ffff' "\\U10ffff" -] - -for i = 1:size(cx,1) - @test cx[i,1] == convert(UInt32, cx[i,2]) - @test string(cx[i,2]) == unescape_string(cx[i,3]) - if isascii(cx[i,2]) || !isprint(cx[i,2]) - @test cx[i,3] == escape_string(string(cx[i,2])) - end - for j = 1:size(cx,1) - str = string(cx[i,2], cx[j,2]) - @test str == unescape_string(escape_string(str)) - end -end - -for i = 0:0x7f, p = ["","\0","x","xxx","\x7f","\uFF","\uFFF", - "\uFFFF","\U10000","\U10FFF","\U10FFFF"] - c = Char(i) - cp = string(c,p) - op = string(Char(div(i,8)), oct(i%8), p) - hp = string(Char(div(i,16)), hex(i%16), p) - @test 
string(unescape_string(string("\\",oct(i,1),p))) == cp - @test string(unescape_string(string("\\",oct(i,2),p))) == cp - @test string(unescape_string(string("\\",oct(i,3),p))) == cp - @test string(unescape_string(string("\\",oct(i,4),p))) == op - @test string(unescape_string(string("\\x",hex(i,1),p))) == cp - @test string(unescape_string(string("\\x",hex(i,2),p))) == cp - @test string(unescape_string(string("\\x",hex(i,3),p))) == hp -end - -@test "\z" == unescape_string("\z") == "z" -@test "\X" == unescape_string("\X") == "X" -@test "\AbC" == unescape_string("\AbC") == "AbC" - -@test "\0" == unescape_string("\\0") -@test "\1" == unescape_string("\\1") -@test "\7" == unescape_string("\\7") -@test "\0x" == unescape_string("\\0x") -@test "\1x" == unescape_string("\\1x") -@test "\7x" == unescape_string("\\7x") -@test "\00" == unescape_string("\\00") -@test "\01" == unescape_string("\\01") -@test "\07" == unescape_string("\\07") -@test "\70" == unescape_string("\\70") -@test "\71" == unescape_string("\\71") -@test "\77" == unescape_string("\\77") -@test "\00x" == unescape_string("\\00x") -@test "\01x" == unescape_string("\\01x") -@test "\07x" == unescape_string("\\07x") -@test "\70x" == unescape_string("\\70x") -@test "\71x" == unescape_string("\\71x") -@test "\77x" == unescape_string("\\77x") -@test "\000" == unescape_string("\\000") -@test "\001" == unescape_string("\\001") -@test "\007" == unescape_string("\\007") -@test "\070" == unescape_string("\\070") -@test "\071" == unescape_string("\\071") -@test "\077" == unescape_string("\\077") -@test "\170" == unescape_string("\\170") -@test "\171" == unescape_string("\\171") -@test "\177" == unescape_string("\\177") -@test "\0001" == unescape_string("\\0001") -@test "\0011" == unescape_string("\\0011") -@test "\0071" == unescape_string("\\0071") -@test "\0701" == unescape_string("\\0701") -@test "\0711" == unescape_string("\\0711") -@test "\0771" == unescape_string("\\0771") -@test "\1701" == unescape_string("\\1701") 
-@test "\1711" == unescape_string("\\1711") -@test "\1771" == unescape_string("\\1771") - -@test "\x0" == unescape_string("\\x0") -@test "\x1" == unescape_string("\\x1") -@test "\xf" == unescape_string("\\xf") -@test "\xF" == unescape_string("\\xF") -@test "\x0x" == unescape_string("\\x0x") -@test "\x1x" == unescape_string("\\x1x") -@test "\xfx" == unescape_string("\\xfx") -@test "\xFx" == unescape_string("\\xFx") -@test "\x00" == unescape_string("\\x00") -@test "\x01" == unescape_string("\\x01") -@test "\x0f" == unescape_string("\\x0f") -@test "\x0F" == unescape_string("\\x0F") - -# integer parsing -@test is(parse(Int32,"0",36),Int32(0)) -@test is(parse(Int32,"1",36),Int32(1)) -@test is(parse(Int32,"9",36),Int32(9)) -@test is(parse(Int32,"A",36),Int32(10)) -@test is(parse(Int32,"a",36),Int32(10)) -@test is(parse(Int32,"B",36),Int32(11)) -@test is(parse(Int32,"b",36),Int32(11)) -@test is(parse(Int32,"F",36),Int32(15)) -@test is(parse(Int32,"f",36),Int32(15)) -@test is(parse(Int32,"Z",36),Int32(35)) -@test is(parse(Int32,"z",36),Int32(35)) - -@test parse(Int,"0") == 0 -@test parse(Int,"-0") == 0 -@test parse(Int,"1") == 1 -@test parse(Int,"-1") == -1 -@test parse(Int,"9") == 9 -@test parse(Int,"-9") == -9 -@test parse(Int,"10") == 10 -@test parse(Int,"-10") == -10 -@test parse(Int64,"3830974272") == 3830974272 -@test parse(Int64,"-3830974272") == -3830974272 -@test parse(Int,'3') == 3 -@test parse(Int,'3', 8) == 3 - -parsebin(s) = parse(Int,s,2) -parseoct(s) = parse(Int,s,8) -parsehex(s) = parse(Int,s,16) - -@test parsebin("0") == 0 -@test parsebin("-0") == 0 -@test parsebin("1") == 1 -@test parsebin("-1") == -1 -@test parsebin("10") == 2 -@test parsebin("-10") == -2 -@test parsebin("11") == 3 -@test parsebin("-11") == -3 -@test parsebin("1111000011110000111100001111") == 252645135 -@test parsebin("-1111000011110000111100001111") == -252645135 - -@test parseoct("0") == 0 -@test parseoct("-0") == 0 -@test parseoct("1") == 1 -@test parseoct("-1") == -1 -@test 
parseoct("7") == 7 -@test parseoct("-7") == -7 -@test parseoct("10") == 8 -@test parseoct("-10") == -8 -@test parseoct("11") == 9 -@test parseoct("-11") == -9 -@test parseoct("72") == 58 -@test parseoct("-72") == -58 -@test parseoct("3172207320") == 434704080 -@test parseoct("-3172207320") == -434704080 - -@test parsehex("0") == 0 -@test parsehex("-0") == 0 -@test parsehex("1") == 1 -@test parsehex("-1") == -1 -@test parsehex("9") == 9 -@test parsehex("-9") == -9 -@test parsehex("a") == 10 -@test parsehex("-a") == -10 -@test parsehex("f") == 15 -@test parsehex("-f") == -15 -@test parsehex("10") == 16 -@test parsehex("-10") == -16 -@test parsehex("0BADF00D") == 195948557 -@test parsehex("-0BADF00D") == -195948557 -@test parse(Int64,"BADCAB1E",16) == 3135023902 -@test parse(Int64,"-BADCAB1E",16) == -3135023902 -@test parse(Int64,"CafeBabe",16) == 3405691582 -@test parse(Int64,"-CafeBabe",16) == -3405691582 -@test parse(Int64,"DeadBeef",16) == 3735928559 -@test parse(Int64,"-DeadBeef",16) == -3735928559 - -@test parse(Int,"2\n") == 2 -@test parse(Int," 2 \n ") == 2 -@test parse(Int," 2 ") == 2 -@test parse(Int,"2 ") == 2 -@test parse(Int," 2") == 2 -@test parse(Int,"+2\n") == 2 -@test parse(Int,"-2") == -2 -@test_throws ArgumentError parse(Int," 2 \n 0") -@test_throws ArgumentError parse(Int,"2x") -@test_throws ArgumentError parse(Int,"-") - -# multibyte spaces -@test parse(Int, "3\u2003\u202F") == 3 -@test_throws ArgumentError parse(Int, "3\u2003\u202F,") - -@test parse(Int,'a') == 10 -@test_throws ArgumentError parse(Int,typemax(Char)) - -@test parse(Int,"1234") == 1234 -@test parse(Int,"0x1234") == 0x1234 -@test parse(Int,"0o1234") == 0o1234 -@test parse(Int,"0b1011") == 0b1011 -@test parse(Int,"-1234") == -1234 -@test parse(Int,"-0x1234") == -Int(0x1234) -@test parse(Int,"-0o1234") == -Int(0o1234) -@test parse(Int,"-0b1011") == -Int(0b1011) - -## FIXME: #4905, do these tests for Int128/UInt128! 
-for T in (Int8, Int16, Int32, Int64) - @test parse(T,string(typemin(T))) == typemin(T) - @test parse(T,string(typemax(T))) == typemax(T) - @test_throws OverflowError parse(T,string(big(typemin(T))-1)) - @test_throws OverflowError parse(T,string(big(typemax(T))+1)) -end - -for T in (UInt8,UInt16,UInt32,UInt64) - @test parse(T,string(typemin(T))) == typemin(T) - @test parse(T,string(typemax(T))) == typemax(T) - @test_throws ArgumentError parse(T,string(big(typemin(T))-1)) - @test_throws OverflowError parse(T,string(big(typemax(T))+1)) -end - -@test lpad("foo", 3) == "foo" -@test rpad("foo", 3) == "foo" -@test lpad("foo", 5) == " foo" -@test rpad("foo", 5) == "foo " -@test lpad("foo", 5, " ") == " foo" -@test rpad("foo", 5, " ") == "foo " -@test lpad("foo", 6, " ") == " foo" -@test rpad("foo", 6, " ") == "foo " - -# string manipulation -@test strip("\t hi \n") == "hi" -@test strip("foobarfoo", ['f', 'o']) == "bar" - -# some test strings -astr = "Hello, world.\n" -u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" - -## generic string uses only endof and next ## - -immutable GenericString <: AbstractString - string::AbstractString -end - -Base.endof(s::GenericString) = endof(s.string) -Base.next(s::GenericString, i::Int) = next(s.string, i) - -# ascii search -for str in [astr, GenericString(astr)] - @test_throws BoundsError search(str, 'z', 0) - @test_throws BoundsError search(str, '∀', 0) - @test search(str, 'x') == 0 - @test search(str, '\0') == 0 - @test search(str, '\u80') == 0 - @test search(str, '∀') == 0 - @test search(str, 'H') == 1 - @test search(str, 'l') == 3 - @test search(str, 'l', 4) == 4 - @test search(str, 'l', 5) == 11 - @test search(str, 'l', 12) == 0 - @test search(str, ',') == 6 - @test search(str, ',', 7) == 0 - @test search(str, '\n') == 14 - @test search(str, '\n', 15) == 0 - @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1) - @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1) -end - -# ascii rsearch 
-for str in [astr] - @test rsearch(str, 'x') == 0 - @test rsearch(str, '\0') == 0 - @test rsearch(str, '\u80') == 0 - @test rsearch(str, '∀') == 0 - @test rsearch(str, 'H') == 1 - @test rsearch(str, 'H', 0) == 0 - @test rsearch(str, 'l') == 11 - @test rsearch(str, 'l', 5) == 4 - @test rsearch(str, 'l', 4) == 4 - @test rsearch(str, 'l', 3) == 3 - @test rsearch(str, 'l', 2) == 0 - @test rsearch(str, ',') == 6 - @test rsearch(str, ',', 5) == 0 - @test rsearch(str, '\n') == 14 -end - -# utf-8 search -for str in (u8str, GenericString(u8str)) - @test_throws BoundsError search(str, 'z', 0) - @test_throws BoundsError search(str, '∀', 0) - @test search(str, 'z') == 0 - @test search(str, '\0') == 0 - @test search(str, '\u80') == 0 - @test search(str, '∄') == 0 - @test search(str, '∀') == 1 - @test_throws UnicodeError search(str, '∀', 2) - @test search(str, '∀', 4) == 0 - @test search(str, '∃') == 13 - @test_throws UnicodeError search(str, '∃', 15) - @test search(str, '∃', 16) == 0 - @test search(str, 'x') == 26 - @test search(str, 'x', 27) == 43 - @test search(str, 'x', 44) == 0 - @test search(str, 'δ') == 17 - @test_throws UnicodeError search(str, 'δ', 18) - @test search(str, 'δ', nextind(str,17)) == 33 - @test search(str, 'δ', nextind(str,33)) == 0 - @test search(str, 'ε') == 5 - @test search(str, 'ε', nextind(str,5)) == 54 - @test search(str, 'ε', nextind(str,54)) == 0 - @test search(str, 'ε', nextind(str,endof(str))) == 0 - @test search(str, 'a', nextind(str,endof(str))) == 0 - @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1) - @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1) -end - -# utf-8 rsearch -for str in [u8str] - @test rsearch(str, 'z') == 0 - @test rsearch(str, '\0') == 0 - @test rsearch(str, '\u80') == 0 - @test rsearch(str, '∄') == 0 - @test rsearch(str, '∀') == 1 - @test rsearch(str, '∀', 0) == 0 - @test rsearch(str, '∃') == 13 - @test rsearch(str, '∃', 14) == 13 - @test rsearch(str, '∃', 13) == 13 - @test rsearch(str, 
'∃', 12) == 0 - @test rsearch(str, 'x') == 43 - @test rsearch(str, 'x', 42) == 26 - @test rsearch(str, 'x', 25) == 0 - @test rsearch(str, 'δ') == 33 - @test rsearch(str, 'δ', 32) == 17 - @test rsearch(str, 'δ', 16) == 0 - @test rsearch(str, 'ε') == 54 - @test rsearch(str, 'ε', 53) == 5 - @test rsearch(str, 'ε', 4) == 0 -end - -# string search with a single-char string -@test search(astr, "x") == 0:-1 -@test search(astr, "H") == 1:1 -@test search(astr, "H", 2) == 0:-1 -@test search(astr, "l") == 3:3 -@test search(astr, "l", 4) == 4:4 -@test search(astr, "l", 5) == 11:11 -@test search(astr, "l", 12) == 0:-1 -@test search(astr, "\n") == 14:14 -@test search(astr, "\n", 15) == 0:-1 - -@test search(u8str, "z") == 0:-1 -@test search(u8str, "∄") == 0:-1 -@test search(u8str, "∀") == 1:1 -@test search(u8str, "∀", 4) == 0:-1 -@test search(u8str, "∃") == 13:13 -@test search(u8str, "∃", 16) == 0:-1 -@test search(u8str, "x") == 26:26 -@test search(u8str, "x", 27) == 43:43 -@test search(u8str, "x", 44) == 0:-1 -@test search(u8str, "ε") == 5:5 -@test search(u8str, "ε", 7) == 54:54 -@test search(u8str, "ε", 56) == 0:-1 - -# string rsearch with a single-char string -@test rsearch(astr, "x") == 0:-1 -@test rsearch(astr, "H") == 1:1 -@test rsearch(astr, "H", 2) == 1:1 -@test rsearch(astr, "H", 0) == 0:-1 -@test rsearch(astr, "l") == 11:11 -@test rsearch(astr, "l", 10) == 4:4 -@test rsearch(astr, "l", 4) == 4:4 -@test rsearch(astr, "l", 3) == 3:3 -@test rsearch(astr, "l", 2) == 0:-1 -@test rsearch(astr, "\n") == 14:14 -@test rsearch(astr, "\n", 13) == 0:-1 - -@test rsearch(u8str, "z") == 0:-1 -@test rsearch(u8str, "∄") == 0:-1 -@test rsearch(u8str, "∀") == 1:1 -@test rsearch(u8str, "∀", 0) == 0:-1 -#TODO: setting the limit in the middle of a wide char -# makes search fail but rsearch succeed. -# Should rsearch fail as well? 
-#@test rsearch(u8str, "∀", 2) == 0:-1 # gives 1:3 -@test rsearch(u8str, "∃") == 13:13 -@test rsearch(u8str, "∃", 12) == 0:-1 -@test rsearch(u8str, "x") == 43:43 -@test rsearch(u8str, "x", 42) == 26:26 -@test rsearch(u8str, "x", 25) == 0:-1 -@test rsearch(u8str, "ε") == 54:54 -@test rsearch(u8str, "ε", 53) == 5:5 -@test rsearch(u8str, "ε", 4) == 0:-1 - -# string search with a single-char regex -@test search(astr, r"x") == 0:-1 -@test search(astr, r"H") == 1:1 -@test search(astr, r"H", 2) == 0:-1 -@test search(astr, r"l") == 3:3 -@test search(astr, r"l", 4) == 4:4 -@test search(astr, r"l", 5) == 11:11 -@test search(astr, r"l", 12) == 0:-1 -@test search(astr, r"\n") == 14:14 -@test search(astr, r"\n", 15) == 0:-1 -@test search(u8str, r"z") == 0:-1 -@test search(u8str, r"∄") == 0:-1 -@test search(u8str, r"∀") == 1:1 -@test search(u8str, r"∀", 4) == 0:-1 -@test search(u8str, r"∀") == search(u8str, r"\u2200") -@test search(u8str, r"∀", 4) == search(u8str, r"\u2200", 4) -@test search(u8str, r"∃") == 13:13 -@test search(u8str, r"∃", 16) == 0:-1 -@test search(u8str, r"x") == 26:26 -@test search(u8str, r"x", 27) == 43:43 -@test search(u8str, r"x", 44) == 0:-1 -@test search(u8str, r"ε") == 5:5 -@test search(u8str, r"ε", 7) == 54:54 -@test search(u8str, r"ε", 56) == 0:-1 -for i = 1:endof(astr) - @test search(astr, r"."s, i) == i:i -end -for i = 1:endof(u8str) - if isvalid(u8str,i) - @test search(u8str, r"."s, i) == i:i - end -end - -# string search with a zero-char string -for i = 1:endof(astr) - @test search(astr, "", i) == i:i-1 -end -for i = 1:endof(u8str) - @test search(u8str, "", i) == i:i-1 -end -@test search("", "") == 1:0 - -# string rsearch with a zero-char string -for i = 1:endof(astr) - @test rsearch(astr, "", i) == i:i-1 -end -for i = 1:endof(u8str) - @test rsearch(u8str, "", i) == i:i-1 -end -@test rsearch("", "") == 1:0 - -# string search with a zero-char regex -for i = 1:endof(astr) - @test search(astr, r"", i) == i:i-1 -end -for i = 1:endof(u8str) - # TODO: 
should regex search fast-forward invalid indices? - if isvalid(u8str,i) - @test search(u8str, r""s, i) == i:i-1 - end -end - -# string search with a two-char string literal -@test search("foo,bar,baz", "xx") == 0:-1 -@test search("foo,bar,baz", "fo") == 1:2 -@test search("foo,bar,baz", "fo", 3) == 0:-1 -@test search("foo,bar,baz", "oo") == 2:3 -@test search("foo,bar,baz", "oo", 4) == 0:-1 -@test search("foo,bar,baz", "o,") == 3:4 -@test search("foo,bar,baz", "o,", 5) == 0:-1 -@test search("foo,bar,baz", ",b") == 4:5 -@test search("foo,bar,baz", ",b", 6) == 8:9 -@test search("foo,bar,baz", ",b", 10) == 0:-1 -@test search("foo,bar,baz", "az") == 10:11 -@test search("foo,bar,baz", "az", 12) == 0:-1 - -# issue #9365 -# string search with a two-char UTF-8 (2 byte) string literal -@test search("ééé", "éé") == 1:3 -@test search("ééé", "éé", 1) == 1:3 -# string search with a two-char UTF-8 (3 byte) string literal -@test search("€€€", "€€") == 1:4 -@test search("€€€", "€€", 1) == 1:4 -# string search with a two-char UTF-8 (4 byte) string literal -@test search("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 1:5 -@test search("\U1f596\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5 - -# string search with a two-char UTF-8 (2 byte) string literal -@test search("éé", "éé") == 1:3 -@test search("éé", "éé", 1) == 1:3 -# string search with a two-char UTF-8 (3 byte) string literal -@test search("€€", "€€") == 1:4 -@test search("€€", "€€", 1) == 1:4 -# string search with a two-char UTF-8 (4 byte) string literal -@test search("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 -@test search("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5 - -# string rsearch with a two-char UTF-8 (2 byte) string literal -@test rsearch("ééé", "éé") == 3:5 -@test rsearch("ééé", "éé", endof("ééé")) == 3:5 -# string rsearch with a two-char UTF-8 (3 byte) string literal -@test rsearch("€€€", "€€") == 4:7 -@test rsearch("€€€", "€€", endof("€€€")) == 4:7 -# string rsearch with a two-char UTF-8 (4 byte) string literal -@test 
rsearch("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 5:9 -@test rsearch("\U1f596\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 5:9 - -# string rsearch with a two-char UTF-8 (2 byte) string literal -@test rsearch("éé", "éé") == 1:3 # should really be 1:4! -@test rsearch("éé", "éé", endof("ééé")) == 1:3 -# string search with a two-char UTF-8 (3 byte) string literal -@test rsearch("€€", "€€") == 1:4 # should really be 1:6! -@test rsearch("€€", "€€", endof("€€€")) == 1:4 -# string search with a two-char UTF-8 (4 byte) string literal -@test rsearch("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 # should really be 1:8! -@test rsearch("\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 1:5 - -# string rsearch with a two-char string literal -@test rsearch("foo,bar,baz", "xx") == 0:-1 -@test rsearch("foo,bar,baz", "fo") == 1:2 -@test rsearch("foo,bar,baz", "fo", 1) == 0:-1 -@test rsearch("foo,bar,baz", "oo") == 2:3 -@test rsearch("foo,bar,baz", "oo", 2) == 0:-1 -@test rsearch("foo,bar,baz", "o,") == 3:4 -@test rsearch("foo,bar,baz", "o,", 1) == 0:-1 -@test rsearch("foo,bar,baz", ",b") == 8:9 -@test rsearch("foo,bar,baz", ",b", 6) == 4:5 -@test rsearch("foo,bar,baz", ",b", 3) == 0:-1 -@test rsearch("foo,bar,baz", "az") == 10:11 -@test rsearch("foo,bar,baz", "az", 10) == 0:-1 - -# array rsearch -@test rsearch(UInt8[1,2,3],UInt8[2,3],3) == 2:3 -@test rsearch(UInt8[1,2,3],UInt8[2,3],1) == 0:-1 - -# string search with a two-char regex -@test search("foo,bar,baz", r"xx") == 0:-1 -@test search("foo,bar,baz", r"fo") == 1:2 -@test search("foo,bar,baz", r"fo", 3) == 0:-1 -@test search("foo,bar,baz", r"oo") == 2:3 -@test search("foo,bar,baz", r"oo", 4) == 0:-1 -@test search("foo,bar,baz", r"o,") == 3:4 -@test search("foo,bar,baz", r"o,", 5) == 0:-1 -@test search("foo,bar,baz", r",b") == 4:5 -@test search("foo,bar,baz", r",b", 6) == 8:9 -@test search("foo,bar,baz", r",b", 10) == 0:-1 -@test search("foo,bar,baz", r"az") == 10:11 -@test 
search("foo,bar,baz", r"az", 12) == 0:-1 - -@test searchindex("foo", 'o') == 2 -@test searchindex("foo", 'o', 3) == 3 - -# string searchindex with a two-char UTF-8 (2 byte) string literal -@test searchindex("ééé", "éé") == 1 -@test searchindex("ééé", "éé", 1) == 1 -# string searchindex with a two-char UTF-8 (3 byte) string literal -@test searchindex("€€€", "€€") == 1 -@test searchindex("€€€", "€€", 1) == 1 -# string searchindex with a two-char UTF-8 (4 byte) string literal -@test searchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 1 -@test searchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596", 1) == 1 - -# string searchindex with a two-char UTF-8 (2 byte) string literal -@test searchindex("éé", "éé") == 1 -@test searchindex("éé", "éé", 1) == 1 -# string searchindex with a two-char UTF-8 (3 byte) string literal -@test searchindex("€€", "€€") == 1 -@test searchindex("€€", "€€", 1) == 1 -# string searchindex with a two-char UTF-8 (4 byte) string literal -@test searchindex("\U1f596\U1f596", "\U1f596\U1f596") == 1 -@test searchindex("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1 - -# string rsearchindex with a two-char UTF-8 (2 byte) string literal -@test rsearchindex("ééé", "éé") == 3 -@test rsearchindex("ééé", "éé", endof("ééé")) == 3 -# string rsearchindex with a two-char UTF-8 (3 byte) string literal -@test rsearchindex("€€€", "€€") == 4 -@test rsearchindex("€€€", "€€", endof("€€€")) == 4 -# string rsearchindex with a two-char UTF-8 (4 byte) string literal -@test rsearchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 5 -@test rsearchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 5 - -# string rsearchindex with a two-char UTF-8 (2 byte) string literal -@test rsearchindex("éé", "éé") == 1 -@test rsearchindex("éé", "éé", endof("ééé")) == 1 -# string searchindex with a two-char UTF-8 (3 byte) string literal -@test rsearchindex("€€", "€€") == 1 -@test rsearchindex("€€", "€€", endof("€€€")) == 1 -# string searchindex with a 
two-char UTF-8 (4 byte) string literal -@test rsearchindex("\U1f596\U1f596", "\U1f596\U1f596") == 1 -@test rsearchindex("\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 1 - -# split -@test isequal(split("foo,bar,baz", 'x'), ["foo,bar,baz"]) -@test isequal(split("foo,bar,baz", ','), ["foo","bar","baz"]) -@test isequal(split("foo,bar,baz", ","), ["foo","bar","baz"]) -@test isequal(split("foo,bar,baz", r","), ["foo","bar","baz"]) -@test isequal(split("foo,bar,baz", ','; limit=0), ["foo","bar","baz"]) -@test isequal(split("foo,bar,baz", ','; limit=1), ["foo,bar,baz"]) -@test isequal(split("foo,bar,baz", ','; limit=2), ["foo","bar,baz"]) -@test isequal(split("foo,bar,baz", ','; limit=3), ["foo","bar","baz"]) -@test isequal(split("foo,bar", "o,b"), ["fo","ar"]) - -@test isequal(split("", ','), [""]) -@test isequal(split(",", ','), ["",""]) -@test isequal(split(",,", ','), ["","",""]) -@test isequal(split("", ',' ; keep=false), []) -@test isequal(split(",", ',' ; keep=false), []) -@test isequal(split(",,", ','; keep=false), []) - -@test isequal(split("a b c"), ["a","b","c"]) -@test isequal(split("a b \t c\n"), ["a","b","c"]) - -@test isequal(rsplit("foo,bar,baz", 'x'), ["foo,bar,baz"]) -@test isequal(rsplit("foo,bar,baz", ','), ["foo","bar","baz"]) -@test isequal(rsplit("foo,bar,baz", ","), ["foo","bar","baz"]) -@test isequal(rsplit("foo,bar,baz", ','; limit=0), ["foo","bar","baz"]) -@test isequal(rsplit("foo,bar,baz", ','; limit=1), ["foo,bar,baz"]) -@test isequal(rsplit("foo,bar,baz", ','; limit=2), ["foo,bar","baz"]) -@test isequal(rsplit("foo,bar,baz", ','; limit=3), ["foo","bar","baz"]) -@test isequal(rsplit("foo,bar", "o,b"), ["fo","ar"]) - -@test isequal(rsplit("", ','), [""]) -@test isequal(rsplit(",", ','), ["",""]) -@test isequal(rsplit(",,", ','), ["","",""]) -@test isequal(rsplit(",,", ','; limit=2), [",",""]) -@test isequal(rsplit("", ',' ; keep=false), []) -@test isequal(rsplit(",", ',' ; keep=false), []) -@test isequal(rsplit(",,", 
','; keep=false), []) - -#@test isequal(rsplit("a b c"), ["a","b","c"]) -#@test isequal(rsplit("a b \t c\n"), ["a","b","c"]) - -let str = "a.:.ba..:..cba.:.:.dcba.:." -@test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""]) -@test isequal(split(str, ".:."; keep=false), ["a","ba.",".cba",":.dcba"]) -@test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""]) -@test isequal(split(str, r"\.(:\.)+"), ["a","ba.",".cba","dcba",""]) -@test isequal(split(str, r"\.(:\.)+"; keep=false), ["a","ba.",".cba","dcba"]) -@test isequal(split(str, r"\.+:\.+"), ["a","ba","cba",":.dcba",""]) -@test isequal(split(str, r"\.+:\.+"; keep=false), ["a","ba","cba",":.dcba"]) - -@test isequal(rsplit(str, ".:."), ["a","ba.",".cba.:","dcba",""]) -@test isequal(rsplit(str, ".:."; keep=false), ["a","ba.",".cba.:","dcba"]) -@test isequal(rsplit(str, ".:."; limit=2), ["a.:.ba..:..cba.:.:.dcba", ""]) -@test isequal(rsplit(str, ".:."; limit=3), ["a.:.ba..:..cba.:", "dcba", ""]) -@test isequal(rsplit(str, ".:."; limit=4), ["a.:.ba.", ".cba.:", "dcba", ""]) -@test isequal(rsplit(str, ".:."; limit=5), ["a", "ba.", ".cba.:", "dcba", ""]) -@test isequal(rsplit(str, ".:."; limit=6), ["a", "ba.", ".cba.:", "dcba", ""]) -end - -# zero-width splits -@test isequal(rsplit("", ""), [""]) - -@test isequal(split("", ""), [""]) -@test isequal(split("", r""), [""]) -@test isequal(split("abc", ""), ["a","b","c"]) -@test isequal(split("abc", r""), ["a","b","c"]) -@test isequal(split("abcd", r"b?"), ["a","c","d"]) -@test isequal(split("abcd", r"b*"), ["a","c","d"]) -@test isequal(split("abcd", r"b+"), ["a","cd"]) -@test isequal(split("abcd", r"b?c?"), ["a","d"]) -@test isequal(split("abcd", r"[bc]?"), ["a","","d"]) -@test isequal(split("abcd", r"a*"), ["","b","c","d"]) -@test isequal(split("abcd", r"a+"), ["","bcd"]) -@test isequal(split("abcd", r"d*"), ["a","b","c",""]) -@test isequal(split("abcd", r"d+"), ["abc",""]) -@test isequal(split("abcd", r"[ad]?"), ["","b","c",""]) - -# replace -@test 
replace("foobar", 'o', '0') == "f00bar" -@test replace("foobar", 'o', '0', 1) == "f0obar" -@test replace("foobar", 'o', "") == "fbar" -@test replace("foobar", 'o', "", 1) == "fobar" -@test replace("foobar", 'f', 'F') == "Foobar" -@test replace("foobar", 'r', 'R') == "foobaR" - -@test replace("foofoofoo", "foo", "bar") == "barbarbar" -@test replace("foobarfoo", "foo", "baz") == "bazbarbaz" -@test replace("barfoofoo", "foo", "baz") == "barbazbaz" - -@test replace("", "", "") == "" -@test replace("", "", "x") == "x" -@test replace("", "x", "y") == "" - -@test replace("abcd", "", "^") == "^a^b^c^d^" -@test replace("abcd", "b", "^") == "a^cd" -@test replace("abcd", r"b?", "^") == "^a^c^d^" -@test replace("abcd", r"b+", "^") == "a^cd" -@test replace("abcd", r"b?c?", "^") == "^a^d^" -@test replace("abcd", r"[bc]?", "^") == "^a^^d^" - -@test replace("foobarfoo", r"(fo|ba)", "xx") == "xxoxxrxxo" -@test replace("foobarfoo", r"(foo|ba)", "bar") == "barbarrbar" - -@test replace("foobar", 'o', 'ø') == "føøbar" -@test replace("foobar", 'o', 'ø', 1) == "føobar" -@test replace("føøbar", 'ø', 'o') == "foobar" -@test replace("føøbar", 'ø', 'o', 1) == "foøbar" -@test replace("føøbar", 'ø', 'ö') == "fööbar" -@test replace("føøbar", 'ø', 'ö', 1) == "föøbar" -@test replace("føøbar", 'ø', "") == "fbar" -@test replace("føøbar", 'ø', "", 1) == "føbar" -@test replace("føøbar", 'f', 'F') == "Føøbar" -@test replace("ḟøøbar", 'ḟ', 'F') == "Føøbar" -@test replace("føøbar", 'f', 'Ḟ') == "Ḟøøbar" -@test replace("ḟøøbar", 'ḟ', 'Ḟ') == "Ḟøøbar" -@test replace("føøbar", 'r', 'R') == "føøbaR" -@test replace("føøbaṙ", 'ṙ', 'R') == "føøbaR" -@test replace("føøbar", 'r', 'Ṙ') == "føøbaṘ" -@test replace("føøbaṙ", 'ṙ', 'Ṙ') == "føøbaṘ" - -@test replace("ḟøøḟøøḟøø", "ḟøø", "bar") == "barbarbar" -@test replace("ḟøøbarḟøø", "ḟøø", "baz") == "bazbarbaz" -@test replace("barḟøøḟøø", "ḟøø", "baz") == "barbazbaz" - -@test replace("foofoofoo", "foo", "ƀäṙ") == "ƀäṙƀäṙƀäṙ" -@test replace("fooƀäṙfoo", "foo", "baz") 
== "bazƀäṙbaz" -@test replace("ƀäṙfoofoo", "foo", "baz") == "ƀäṙbazbaz" - -@test replace("foofoofoo", "foo", "bar") == "barbarbar" -@test replace("foobarfoo", "foo", "ƀäż") == "ƀäżbarƀäż" -@test replace("barfoofoo", "foo", "ƀäż") == "barƀäżƀäż" - -@test replace("ḟøøḟøøḟøø", "ḟøø", "ƀäṙ") == "ƀäṙƀäṙƀäṙ" -@test replace("ḟøøƀäṙḟøø", "ḟøø", "baz") == "bazƀäṙbaz" -@test replace("ƀäṙḟøøḟøø", "ḟøø", "baz") == "ƀäṙbazbaz" - -@test replace("ḟøøḟøøḟøø", "ḟøø", "bar") == "barbarbar" -@test replace("ḟøøbarḟøø", "ḟøø", "ƀäż") == "ƀäżbarƀäż" -@test replace("barḟøøḟøø", "ḟøø", "ƀäż") == "barƀäżƀäż" - -@test replace("ḟøøḟøøḟøø", "ḟøø", "ƀäṙ") == "ƀäṙƀäṙƀäṙ" -@test replace("ḟøøƀäṙḟøø", "ḟøø", "ƀäż") == "ƀäżƀäṙƀäż" -@test replace("ƀäṙḟøøḟøø", "ḟøø", "ƀäż") == "ƀäṙƀäżƀäż" - -@test replace("", "", "ẍ") == "ẍ" -@test replace("", "ẍ", "ÿ") == "" - -@test replace("äƀçđ", "", "π") == "πäπƀπçπđπ" -@test replace("äƀçđ", "ƀ", "π") == "äπçđ" -@test replace("äƀçđ", r"ƀ?", "π") == "πäπçπđπ" -@test replace("äƀçđ", r"ƀ+", "π") == "äπçđ" -@test replace("äƀçđ", r"ƀ?ç?", "π") == "πäπđπ" -@test replace("äƀçđ", r"[ƀç]?", "π") == "πäππđπ" - -@test replace("foobarfoo", r"(fo|ba)", "ẍẍ") == "ẍẍoẍẍrẍẍo" - -@test replace("ḟøøbarḟøø", r"(ḟø|ba)", "xx") == "xxøxxrxxø" -@test replace("ḟøøbarḟøø", r"(ḟøø|ba)", "bar") == "barbarrbar" - -@test replace("fooƀäṙfoo", r"(fo|ƀä)", "xx") == "xxoxxṙxxo" -@test replace("fooƀäṙfoo", r"(foo|ƀä)", "ƀäṙ") == "ƀäṙƀäṙṙƀäṙ" - -@test replace("ḟøøƀäṙḟøø", r"(ḟø|ƀä)", "xx") == "xxøxxṙxxø" -@test replace("ḟøøƀäṙḟøø", r"(ḟøø|ƀä)", "ƀäṙ") == "ƀäṙƀäṙṙƀäṙ" - -@test replace("foo", "oo", uppercase) == "fOO" - -# chomp/chop -@test chomp("foo\n") == "foo" -@test chop("foob") == "foo" - -# lower and upper -@test uppercase("aBc") == "ABC" -@test uppercase('A') == 'A' -@test uppercase('a') == 'A' -@test lowercase("AbC") == "abc" -@test lowercase('A') == 'a' -@test lowercase('a') == 'a' -@test uppercase('α') == '\u0391' -@test lowercase('Δ') == 'δ' -@test lowercase('\U118bf') == '\U118df' 
-@test uppercase('\U1044d') == '\U10425' -@test ucfirst("Abc") == "Abc" -@test ucfirst("abc") == "Abc" -@test lcfirst("ABC") == "aBC" -@test lcfirst("aBC") == "aBC" - -# {starts,ends}with -@test startswith("abcd", 'a') -@test startswith("abcd", "a") -@test startswith("abcd", "ab") -@test !startswith("ab", "abcd") -@test !startswith("abcd", "bc") -@test endswith("abcd", 'd') -@test endswith("abcd", "d") -@test endswith("abcd", "cd") -@test !endswith("abcd", "dc") -@test !endswith("cd", "abcd") - -@test filter(x -> x ∈ ['f', 'o'], "foobar") == "foo" - -# RepStrings and SubStrings -u8str2 = u8str^2 -len_u8str = length(u8str) -slen_u8str = length(u8str) -len_u8str2 = length(u8str2) -slen_u8str2 = length(u8str2) - -@test len_u8str2 == 2 * len_u8str -@test slen_u8str2 == 2 * slen_u8str - -u8str2plain = utf8(u8str2) - -for i1 = 1:length(u8str2) - if !isvalid(u8str2, i1); continue; end - for i2 = i1:length(u8str2) - if !isvalid(u8str2, i2); continue; end - @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) - @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) - @test u8str2[i1:i2] == u8str2plain[i1:i2] - end -end - -str="tempus fugit" #length(str)==12 -ss=SubString(str,1,length(str)) #match source string -@test length(ss)==length(str) - -ss=SubString(str,1,0) #empty SubString -@test length(ss)==0 - -ss=SubString(str,14,20) #start indexed beyond source string length -@test length(ss)==0 - -ss=SubString(str,10,16) #end indexed beyond source string length -@test length(ss)==3 - -str2="" -ss=SubString(str2,1,4) #empty source string -@test length(ss)==0 - -ss=SubString(str2,1,1) #empty source string, identical start and end index -@test length(ss)==0 - -@test SubString("foobar",big(1),big(3)) == "foo" - -str = "aa\u2200\u2222bb" -u = SubString(str, 3, 6) -@test length(u)==2 -b = IOBuffer() -write(b, u) -@test takebuf_string(b) == "\u2200\u2222" - -str = "føøbar" -u = SubString(str, 4, 3) -@test length(u)==0 -b = IOBuffer() -write(b, u) -@test takebuf_string(b) == 
"" - -str = "føøbar" -u = SubString(str, 10, 10) -@test length(u)==0 -b = IOBuffer() -write(b, u) -@test takebuf_string(b) == "" - -@test replace("\u2202", '*', '\0') == "\u2202" - -# search and SubString (issue #5679) -str = "Hello, world!" -u = SubString(str, 1, 5) -@test rsearch(u, "World") == 0:-1 -@test rsearch(u, 'z') == 0 -@test rsearch(u, "ll") == 3:4 - -# quotes + interpolation (issue #455) -@test "$("string")" == "string" -arr = ["a","b","c"] -@test "[$(join(arr, " - "))]" == "[a - b - c]" - -# string iteration, and issue #1454 -str = "é" -str_a = vcat(str...) -@test length(str_a)==1 -@test str_a[1] == str[1] - -str = "s\u2200" -@test str[1:end] == str - -# triple-quote delimited strings -@test """abc""" == "abc" -@test """ab"c""" == "ab\"c" -@test """ab""c""" == "ab\"\"c" -@test """ab"\"c""" == "ab\"\"c" -@test """abc\"""" == "abc\"" -n = 3 -@test """$n\n""" == "$n\n" -@test """$(n)""" == "3" -@test """$(2n)""" == "6" -@test """$(n+4)""" == "7" -@test """$("string")""" == "string" -a = [3,1,2] -@test """$(a[2])""" == "1" -@test """$(a[3]+7)""" == "9" -@test """$(floor(Int,4.5))""" == "4" -nl = " -" -@test """ - a - b - - c - """ == "a$(nl)b$(nl)$(nl)c$(nl)" -@test """ - """ == "" -@test """x - a - """ == "x$(nl) a$(nl)" -@test """ - $n - """ == " $n$(nl)" -@test """ - a - b - c""" == " a$(nl)b$(nl) c" -# tabs + spaces -@test """ - a - b - """ == " a$(nl) b$(nl)" -@test """ - a - """ == "a$(nl) " -s = " p" -@test """ - $s""" == "$s" -@test """ - $s - """ == " $s$(nl)" -@test """\t""" == "\t" -@test """ - \t""" == "" -@test """ - foo - \tbar""" == "foo$(nl)\tbar" -@test """ - foo - \tbar - """ == "foo$(nl)\tbar$(nl)" -@test """ - foo - bar\t""" == "foo$(nl)bar\t" -@test """ - $("\n ") - """ == "\n $(nl)" - -# bytes2hex and hex2bytes -hex_str = "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592" -bin_val = hex2bytes(hex_str) - -@test div(length(hex_str), 2) == length(bin_val) -@test hex_str == bytes2hex(bin_val) - -bin_val = hex2bytes("07bf") 
-@test bin_val[1] == 7 -@test bin_val[2] == 191 -@test typeof(bin_val) == Array{UInt8, 1} -@test length(bin_val) == 2 - -# all valid hex chars -@test "0123456789abcdefabcdef" == bytes2hex(hex2bytes("0123456789abcdefABCDEF")) - -# odd size -@test_throws ArgumentError hex2bytes("0123456789abcdefABCDEF0") - -#non-hex characters -@test_throws ArgumentError hex2bytes("0123456789abcdefABCDEFGH") - -# sizeof -@test sizeof("abc") == 3 -@test sizeof("\u2222") == 3 -@test sizeof(SubString("abc\u2222def",4,4)) == 3 -@test sizeof(RopeString("abc","def")) == 6 - -# issue #3597 -@test string(utf32(['T', 'e', 's', 't'])[1:1], "X") == "TX" - -# issue #3710 -@test prevind(SubString("{var}",2,4),4) == 3 - -# printf -# int -@test (@sprintf "%d" typemax(Int64)) == "9223372036854775807" -@test (@sprintf "%i" 42) == "42" -@test (@sprintf "%u" 42) == "42" -@test (@sprintf "Test: %i" 42) == "Test: 42" -@test (@sprintf "%#x" 42) == "0x2a" -@test (@sprintf "%#o" 42) == "052" -@test (@sprintf "%X" 42) == "2A" -@test (@sprintf "%X" 42) == "2A" -@test (@sprintf "% i" 42) == " 42" -@test (@sprintf "%+i" 42) == "+42" -@test (@sprintf "%4i" 42) == " 42" -@test (@sprintf "%-4i" 42) == "42 " -# float -@test (@sprintf "%7.2f" 1.2345) == " 1.23" -@test (@sprintf "%-7.2f" 1.2345) == "1.23 " -@test (@sprintf "%07.2f" 1.2345) == "0001.23" -@test (@sprintf "%.0f" 1.2345) == "1" -@test (@sprintf "%#.0f" 1.2345) == "1." 
-# Inf / NaN handling -@test (@sprintf "%f" Inf) == "Inf" -@test (@sprintf "%f" NaN) == "NaN" -# scientific notation -@test (@sprintf "%.4e" 1.2345) == "1.2345e+00" -@test (@sprintf "%.0e" 3e142) == "3e+142" -@test (@sprintf "%#.0e" 3e142) == "3.e+142" -# hex float -@test (@sprintf "%a" 1.5) == "0x1.8p+0" -@test (@sprintf "%#.0a" 1.5) == "0x2.p+0" -@test (@sprintf "%+30a" 1/3) == " +0x1.5555555555555p-2" -# chars -@test (@sprintf "%c" 65) == "A" -@test (@sprintf "%c" 'A') == "A" -@test (@sprintf "%c" 248) == "ø" -@test (@sprintf "%c" 'ø') == "ø" -# strings -@test (@sprintf "%s" "test") == "test" -@test (@sprintf "%s" "tést") == "tést" -# reasonably complex -@test (@sprintf "Test: %s%c%C%c%#-.0f." "t" 65 66 67 -42) == "Test: tABC-42.." -#test simple splatting -@test (@sprintf "%d%d" [1 2]...) == "12" -# combo -@test (@sprintf "%f %d %d %f" 1.0 [3 4]... 5) == "1.000000 3 4 5.000000" -# multi -@test (@sprintf "%s %f %9.5f %d %d %d %d%d%d%d" [1:6;]... [7,8,9,10]...) == "1 2.000000 3.00000 4 5 6 78910" -# comprehension -@test (@sprintf "%s %s %s %d %d %d %f %f %f" Any[10^x+y for x=1:3,y=1:3 ]...) == "11 101 1001 12 102 1002 13.000000 103.000000 1003.000000" - -# issue #4183 -@test split(SubString(ascii("x"), 2, 0), "y") == AbstractString[""] -@test split(SubString(utf8("x"), 2, 0), "y") == AbstractString[""] - -# issue #4586 -@test rsplit(RevString("ailuj"),'l') == ["ju","ia"] -@test parse(Float64,RevString("64")) === 46.0 - -# issue #6772 -@test float(SubString("10",1,1)) === 1.0 -@test float(SubString("1 0",1,1)) === 1.0 -@test parse(Float32,SubString("10",1,1)) === 1.0f0 - -for T = (UInt8,Int8,UInt16,Int16,UInt32,Int32,UInt64,Int64,UInt128,Int128,BigInt), - b = 2:62, _ = 1:10 - n = T != BigInt ? 
rand(T) : BigInt(rand(Int128)) - @test parse(T,base(b,n),b) == n -end - -# normalize_string (Unicode normalization etc.): -@test normalize_string("\u006e\u0303", :NFC) == "\u00f1" -@test "\u006e\u0303" == normalize_string("\u00f1", :NFD) -@test normalize_string("\ufb00", :NFC) != "ff" -@test normalize_string("\ufb00", :NFKC) == "ff" -@test normalize_string("\u006e\u0303\ufb00", :NFKC) == "\u00f1"*"ff" -@test normalize_string("\u00f1\ufb00", :NFKD) == "\u006e\u0303"*"ff" -@test normalize_string("\u006e\u0303", compose=true) == "\u00f1" -@test "\u006e\u0303" == normalize_string("\u00f1", decompose=true) -@test normalize_string("\u006e\u0303\u00b5",compat=true) == "\u00f1\u03bc" -@test normalize_string("Σσς",casefold=true) == "σσσ" -@test normalize_string("∕⁄", lump=true) == "//" -@test normalize_string("\ua\n\r\r\ua", newline2lf=true) == "\ua\ua\ua\ua" -@test normalize_string("\ua\n\r\r\ua", newline2ls=true) == "\u2028\u2028\u2028\u2028" -@test normalize_string("\ua\n\r\r\ua", newline2ps=true) == "\u2029\u2029\u2029\u2029" -@test normalize_string("\u00f1", stripmark=true) == "n" -@test isempty(normalize_string("\u00ad", stripignore=true)) -@test normalize_string("\t\r", stripcc=true) == " " -@test normalize_string("\t\r", stripcc=true, newline2ls=true) == " \u2028" - -#Tests from Unicode SA#15, "Unicode normalization forms" -#http://www.unicode.org/reports/tr15/ - -#1. Canonical equivalence -let ==(a::Array{Char},b::Array{Char}) = normalize_string(string(a...), :NFC)==normalize_string(string(b...), :NFC) - ==(a,b) = Base.(:(==))(a,b) - @test ['C', '̧'] == ['Ç'] - @test ['q', '̇', '̣'] == ['q', '̣', '̇'] - @test ['가'] == ['ᄀ', 'ᅡ'] - @test ['Ω'] == ['Ω'] -end - -#2. 
Compatibility Equivalence -let ==(a::Array{Char},b::Array{Char}) = normalize_string(string(a...), :NFKC)==normalize_string(string(b...), :NFKC) - ==(a,b) = Base.(:(==))(a,b) - @test ['ℌ'] == ['ℍ'] == ['H'] - @test ['ﻨ'] == ['ﻧ'] == ['ﻦ'] == ['ﻥ'] - @test ['①'] == ['1'] - @test ['カ'] == ['カ'] - @test ['︷'] == ['{'] - @test ['⁹'] == ['₉'] - @test ['㌀'] == ['ア', 'パ', 'ー', 'ト'] - @test ['¼'] == ['1', '⁄', '4'] - @test ['dž'] == ['d', 'ž'] -end - -#3. Singletons -@test normalize_string("\U212b", :NFD) == "A\U030a" -@test normalize_string("\U212b", :NFC) == "\U00c5" -@test normalize_string("\U2126", :NFC) == normalize_string("\U2126", :NFD) == "\U03a9" - -#4. Canonical Composites -@test normalize_string("\U00c5", :NFC) == "\U00c5" -@test normalize_string("\U00c5", :NFD) == "A\U030a" -@test normalize_string("\U00f4", :NFC) == "\U00f4" -@test normalize_string("\U00f4", :NFD) == "o\U0302" - -#5. Multiple Combining Marks -@test normalize_string("\U1e69", :NFD) == "s\U0323\U0307" -@test normalize_string("\U1e69", :NFC) == "\U1e69" -@test normalize_string("\U1e0b\U0323", :NFD) == "d\U0323\U0307" -@test normalize_string("\U1e0b\U0323", :NFC) == "\U1e0d\U0307" -@test normalize_string("q\U0307\U0323", :NFC) == "q\U0323\U0307" -@test normalize_string("q\U0307\U0323", :NFD) == "q\U0323\U0307" - -#6. 
Compatibility Composites -@test normalize_string("\Ufb01", :NFD) == normalize_string("\Ufb01", :NFC) == "\Ufb01" -@test normalize_string("\Ufb01", :NFKD) == normalize_string("\Ufb01", :NFKC) == "fi" -@test normalize_string("2\U2075", :NFD) == normalize_string("2\U2075", :NFC) == "2\U2075" -@test normalize_string("2\U2075", :NFKD) == normalize_string("2\U2075", :NFKC) == "25" -@test normalize_string("\U1e9b\U0323", :NFD) == "\U017f\U0323\U0307" -@test normalize_string("\U1e9b\U0323", :NFC) == "\U1e9b\U0323" -@test normalize_string("\U1e9b\U0323", :NFKD) == "s\U0323\U0307" -@test normalize_string("\U1e9b\U0323", :NFKC) == "\U1e69" - -# issue #5870 -@test !ismatch(Regex("aa"), SubString("",1,0)) -@test ismatch(Regex(""), SubString("",1,0)) - -# issue #6027 -let - # make symbol with invalid char - sym = symbol(Char(0xdcdb)) - @test string(sym) == string(Char(0xdcdb)) - @test expand(sym) === sym - res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1]) - @test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))""" -end - -@test symbol("asdf") === :asdf -@test symbol(:abc,"def",'g',"hi",0) === :abcdefghi0 -@test :a < :b -@test startswith(string(gensym("asdf")),"##asdf#") -@test gensym("asdf") != gensym("asdf") -@test gensym() != gensym() -@test startswith(string(gensym()),"##") -@test_throws ArgumentError symbol("ab\0") -@test_throws ArgumentError gensym("ab\0") - -# issue #6949 -let f =IOBuffer(), - x = split("1 2 3") - @test write(f, x) == 3 - @test takebuf_string(f) == "123" - @test invoke(write, Tuple{IO, AbstractArray}, f, x) == 3 - @test takebuf_string(f) == "123" -end - -# issue #7248 -@test_throws BoundsError ind2chr("hello", -1) -@test_throws BoundsError chr2ind("hello", -1) -@test_throws BoundsError ind2chr("hellø", -1) -@test_throws BoundsError chr2ind("hellø", -1) -@test_throws BoundsError ind2chr("hello", 10) -@test_throws BoundsError chr2ind("hello", 10) -@test_throws BoundsError ind2chr("hellø", 10) -@test_throws BoundsError 
chr2ind("hellø", 10) -@test_throws BoundsError checkbounds("hello", 0) -@test_throws BoundsError checkbounds("hello", 6) -@test_throws BoundsError checkbounds("hello", 0:3) -@test_throws BoundsError checkbounds("hello", 4:6) -@test_throws BoundsError checkbounds("hello", [0:3;]) -@test_throws BoundsError checkbounds("hello", [4:6;]) -@test checkbounds("hello", 2) -@test checkbounds("hello", 1:5) -@test checkbounds("hello", [1:5;]) - - -# isvalid(), chr2ind() and ind2chr() for SubString{DirectIndexString} -let s="lorem ipsum", - sdict=Dict(SubString(s,1,11)=>s, - SubString(s,1,6)=>"lorem ", - SubString(s,1,0)=>"", - SubString(s,2,4)=>"ore", - SubString(s,2,16)=>"orem ipsum", - SubString(s,12,14)=>"" - ) - for (ss,s) in sdict - for i in -1:12 - @test isvalid(ss,i)==isvalid(s,i) - end - end - for (ss,s) in sdict - for i in 1:length(ss) - @test ind2chr(ss,i)==ind2chr(s,i) - end - end - for (ss,s) in sdict - for i in 1:length(ss) - @test chr2ind(ss,i)==chr2ind(s,i) - end - end -end #let - -#for isvalid(SubString{UTF8String}) -let s = utf8("Σx + βz - 2") - for i in -1:length(s)+2 - ss=SubString(s,1,i) - @test isvalid(ss,i)==isvalid(s,i) - end -end - -ss=SubString("hello",1,5) -@test_throws BoundsError ind2chr(ss, -1) -@test_throws BoundsError chr2ind(ss, -1) -@test_throws BoundsError chr2ind(ss, 10) -@test_throws BoundsError ind2chr(ss, 10) - -# length(SubString{UTF8String}) performance specialization -let s = "|η(α)-ϕ(κ)| < ε" - @test length(SubString(s,1,0))==length(s[1:0]) - @test length(SubString(s,4,4))==length(s[4:4]) - @test length(SubString(s,1,7))==length(s[1:7]) - @test length(SubString(s,4,11))==length(s[4:11]) -end - -# issue #7764 -let - srep = RepString("Σβ",2) - s="Σβ" - ss=SubString(s,1,endof(s)) - - @test ss^2 == "ΣβΣβ" - @test RepString(ss,2) == "ΣβΣβ" - - @test endof(srep) == 7 - - @test next(srep, 3) == ('β',5) - @test next(srep, 7) == ('β',9) - - @test srep[7] == 'β' - @test_throws BoundsError srep[8] -end - -#issue #5939 uft8proc/libmojibake 
character predicates -let - alower=['a', 'd', 'j', 'y', 'z'] - ulower=['α', 'β', 'γ', 'δ', 'ф', 'я'] - for c in vcat(alower,ulower) - @test islower(c) == true - @test isupper(c) == false - @test isdigit(c) == false - @test isnumber(c) == false - end - - aupper=['A', 'D', 'J', 'Y', 'Z'] - uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Dž', 'Ж', 'Д'] - - for c in vcat(aupper,uupper) - @test islower(c) == false - @test isupper(c) == true - @test isdigit(c) == false - @test isnumber(c) == false - end - - nocase=['א','ﺵ'] - alphas=vcat(alower,ulower,aupper,uupper,nocase) - - for c in alphas - @test isalpha(c) == true - @test isnumber(c) == false - end - - - anumber=['0', '1', '5', '9'] - unumber=['٣', '٥', '٨', '¹', 'ⅳ' ] - - for c in anumber - @test isdigit(c) == true - @test isnumber(c) == true - end - for c in unumber - @test isdigit(c) == false - @test isnumber(c) == true - end - - alnums=vcat(alphas,anumber,unumber) - for c in alnums - @test isalnum(c) == true - @test ispunct(c) == false - end - - asymbol = ['(',')', '~', '$' ] - usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§'] - - apunct =['.',',',';',':','&'] - upunct =['‡', '؟', '჻' ] - - for c in vcat(apunct,upunct) - @test ispunct(c) == true - @test isalnum(c) == false - end - - for c in vcat(alnums,asymbol,usymbol,apunct,upunct) - @test isprint(c) == true - @test isgraph(c) == true - @test isspace(c) == false - @test iscntrl(c) == false - end - - NBSP = Char(0x0000A0) - ENSPACE = Char(0x002002) - EMSPACE = Char(0x002003) - THINSPACE = Char(0x002009) - ZWSPACE = Char(0x002060) - - uspace = [ENSPACE, EMSPACE, THINSPACE] - aspace = [' '] - acntrl_space = ['\t', '\n', '\v', '\f', '\r'] - for c in vcat(aspace,uspace) - @test isspace(c) == true - @test isprint(c) == true - @test isgraph(c) == false - end - - for c in vcat(acntrl_space) - @test isspace(c) == true - @test isprint(c) == false - @test isgraph(c) == false - end - - @test isspace(ZWSPACE) == false # zero-width space - - acontrol = [ Char(0x001c), 
Char(0x001d), Char(0x001e), Char(0x001f)] - latincontrol = [ Char(0x0080), Char(0x0085) ] - ucontrol = [ Char(0x200E), Char(0x202E) ] - - for c in vcat(acontrol, acntrl_space, latincontrol) - @test iscntrl(c) == true - @test isalnum(c) == false - @test isprint(c) == false - @test isgraph(c) == false - end - - for c in ucontrol #non-latin1 controls - if c!=Char(0x0085) - @test iscntrl(c) == false - @test isspace(c) == false - @test isalnum(c) == false - @test isprint(c) == false - @test isgraph(c) == false - end - end - -end - -@test isspace(" \t \n \r ")==true -@test isgraph(" \t \n \r ")==false -@test isprint(" \t \n \r ")==false -@test isalpha(" \t \n \r ")==false -@test isnumber(" \t \n \r ")==false -@test ispunct(" \t \n \r ")==false - -@test isspace("ΣβΣβ")==false -@test isalpha("ΣβΣβ")==true -@test isgraph("ΣβΣβ")==true -@test isprint("ΣβΣβ")==true -@test isupper("ΣβΣβ")==false -@test islower("ΣβΣβ")==false -@test isnumber("ΣβΣβ")==false -@test iscntrl("ΣβΣβ")==false -@test ispunct("ΣβΣβ")==false - -@test isnumber("23435")==true -@test isdigit("23435")==true -@test isalnum("23435")==true -@test isalpha("23435")==false -@test iscntrl( string(Char(0x0080))) == true -@test ispunct( "‡؟჻") ==true - -@test isxdigit('0') == true -@test isxdigit("0") == true -@test isxdigit("a") == true -@test isxdigit("g") == false - -# Issue #11140 -@test isvalid(utf32("a")) == true -@test isvalid(utf32("\x00")) == true -@test isvalid(UTF32String, UInt32[0xd800,0]) == false - -# Issue #11241 - -@test isvalid(ASCIIString, "is_valid_ascii") == true -@test isvalid(ASCIIString, "Σ_not_valid_ascii") == false - -# test all edge conditions -for (val, pass) in ( - (0, true), (0xd7ff, true), - (0xd800, false), (0xdfff, false), - (0xe000, true), (0xffff, true), - (0x10000, true), (0x10ffff, true), - (0x110000, false) - ) - @test isvalid(Char, val) == pass -end -for (val, pass) in ( - (b"\x00", true), - (b"\x7f", true), - (b"\x80", false), - (b"\xbf", false), - (b"\xc0", false), - (b"\xff", 
false), - (b"\xc0\x80", false), - (b"\xc1\x80", false), - (b"\xc2\x80", true), - (b"\xc2\xc0", false), - (b"\xed\x9f\xbf", true), - (b"\xed\xa0\x80", false), - (b"\xed\xbf\xbf", false), - (b"\xee\x80\x80", true), - (b"\xef\xbf\xbf", true), - (b"\xf0\x90\x80\x80", true), - (b"\xf4\x8f\xbf\xbf", true), - (b"\xf4\x90\x80\x80", false), - (b"\xf5\x80\x80\x80", false), - (b"\ud800\udc00", false), - (b"\udbff\udfff", false), - (b"\ud800\u0100", false), - (b"\udc00\u0100", false), - (b"\udc00\ud800", false) - ) - @test isvalid(UTF8String, val) == pass -end -for (val, pass) in ( - (UInt16[0x0000], true), - (UInt16[0xd7ff,0], true), - (UInt16[0xd800,0], false), - (UInt16[0xdfff,0], false), - (UInt16[0xe000,0], true), - (UInt16[0xffff,0], true), - (UInt16[0xd800,0xdc00,0], true), - (UInt16[0xdbff,0xdfff,0], true), - (UInt16[0xd800,0x0100,0], false), - (UInt16[0xdc00,0x0100,0], false), - (UInt16[0xdc00,0xd800,0], false) - ) - @test isvalid(UTF16String, val) == pass -end -for (val, pass) in ( - (UInt32[0x0000], true), - (UInt32[0xd7ff,0], true), - (UInt32[0xd800,0], false), - (UInt32[0xdfff,0], false), - (UInt32[0xe000,0], true), - (UInt32[0xffff,0], true), - (UInt32[0x100000,0], true), - (UInt32[0x10ffff,0], true), - (UInt32[0x110000,0], false), - ) - @test isvalid(UTF32String, val) == pass -end - -# Issue #11203 -@test isvalid(ASCIIString,UInt8[]) == true -@test isvalid(UTF8String, UInt8[]) == true -@test isvalid(UTF16String,UInt16[]) == true -@test isvalid(UTF32String,UInt32[]) == true - -# Check UTF-8 characters -# Check ASCII range (true), -# then single continuation bytes and lead bytes with no following continuation bytes (false) -for (rng,flg) in ((0:0x7f, true), (0x80:0xff, false)) - for byt in rng - @test isvalid(UTF8String, UInt8[byt]) == flg - end -end -# Check overlong lead bytes for 2-character sequences (false) -for byt = 0xc0:0xc1 - @test isvalid(UTF8String, UInt8[byt,0x80]) == false -end -# Check valid lead-in to two-byte sequences (true) -for byt = 0xc2:0xdf - 
for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false)) - for cont in rng - @test isvalid(UTF8String, UInt8[byt, cont]) == flg - end - end -end -# Check three-byte sequences -for r1 in (0xe0:0xec, 0xee:0xef) - for byt = r1 - # Check for short sequence - @test isvalid(UTF8String, UInt8[byt]) == false - for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false)) - for cont in rng - @test isvalid(UTF8String, UInt8[byt, cont]) == false - @test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == flg - end - end - end -end -# Check hangul characters (0xd000-0xd7ff) hangul -# Check for short sequence, or start of surrogate pair -for (rng,flg) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false)) - for cont in rng - @test isvalid(UTF8String, UInt8[0xed, cont]) == false - @test isvalid(UTF8String, UInt8[0xed, cont, 0x80]) == flg - end -end -# Check valid four-byte sequences -for byt = 0xf0:0xf4 - if (byt == 0xf0) - r0 = ((0x00:0x8f, false), (0x90:0xbf, true), (0xc0:0xff, false)) - elseif byt == 0xf4 - r0 = ((0x00:0x7f, false), (0x80:0x8f, true), (0x90:0xff, false)) - else - r0 = ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false)) - end - for (rng,flg) in r0 - for cont in rng - @test isvalid(UTF8String, UInt8[byt, cont]) == false - @test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == false - @test isvalid(UTF8String, UInt8[byt, cont, 0x80, 0x80]) == flg - end - end -end -# Check five-byte sequences, should be invalid -for byt = 0xf8:0xfb - @test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false -end -# Check six-byte sequences, should be invalid -for byt = 0xfc:0xfd - @test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false -end -# Check seven-byte sequences, should be invalid -@test isvalid(UTF8String, UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false - -# 11482 - -# isvalid -let s = "abcdef", u8 = "abcdef\uff", u16 = utf16(u8), u32 = utf32(u8), - bad32 = utf32(UInt32[65,0x110000]), badch 
= Char[0x110000][1] - - @test !isvalid(bad32) - @test !isvalid(badch) - @test isvalid(s) - @test isvalid(u8) - @test isvalid(u16) - @test isvalid(u32) - @test isvalid(ASCIIString, s) - @test isvalid(UTF8String, u8) - @test isvalid(UTF16String, u16) - @test isvalid(UTF32String, u32) -end - -# This caused JuliaLang/JSON.jl#82 -@test first('\x00':'\x7f') === '\x00' -@test last('\x00':'\x7f') === '\x7f' - -# Tests of join() -@test join([]) == "" -@test join(["a"],"?") == "a" -@test join("HELLO",'-') == "H-E-L-L-O" -@test join(1:5, ", ", " and ") == "1, 2, 3, 4 and 5" -@test join(["apples", "bananas", "pineapples"], ", ", " and ") == "apples, bananas and pineapples" - -# issue #9178 `join` calls `done()` twice on the iterables -type i9178 - nnext::Int64 - ndone::Int64 -end -Base.start(jt::i9178) = (jt.nnext=0 ; jt.ndone=0 ; 0) -Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3) -Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1)) -@test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4" - -# make sure substrings handle last code unit even if not start of codepoint -let s = "x\u0302" - @test s[1:3] == s -end - -# reverseind -for T in (ASCIIString, UTF8String, UTF16String, UTF32String) - for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") - for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") - for c in ('X', 'δ', '\U0001d6a5') - T != ASCIIString || (isascii(prefix) && isascii(suffix) && isascii(c)) || continue - s = convert(T, string(prefix, c, suffix)) - ri = search(reverse(s), c) - @test reverse(s) == RevString(s) - @test c == s[reverseind(s, ri)] == reverse(s)[ri] - s = RevString(s) - ri = search(reverse(s), c) - @test c == s[reverseind(s, ri)] == reverse(s)[ri] - s = convert(T, string(prefix, prefix, c, suffix, suffix)) - pre = convert(T, prefix) - sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) - ri = 
search(reverse(sb), c) - @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri] - end - end - end -end - -# issue #9781 -# float(SubString) wasn't tolerant of trailing whitespace, which was different -# to "normal" strings. This also checks we aren't being too tolerant and allowing -# any arbitrary trailing characters. -@test parse(Float64,"1\n") == 1.0 -@test [parse(Float64,x) for x in split("0,1\n",",")][2] == 1.0 -@test_throws ArgumentError parse(Float64,split("0,1 X\n",",")[2]) -@test parse(Float32,"1\n") == 1.0 -@test [parse(Float32,x) for x in split("0,1\n",",")][2] == 1.0 -@test_throws ArgumentError parse(Float32,split("0,1 X\n",",")[2]) - -#more ascii tests -@test convert(ASCIIString, UInt8[32,107,75], "*") == " kK" -@test convert(ASCIIString, UInt8[132,107,75], "*") == "*kK" -@test convert(ASCIIString, UInt8[], "*") == "" -@test convert(ASCIIString, UInt8[255], "*") == "*" - -@test ucfirst("Hola")=="Hola" -@test ucfirst("hola")=="Hola" -@test ucfirst("")=="" -@test ucfirst("*")=="*" - -@test lcfirst("Hola")=="hola" -@test lcfirst("hola")=="hola" -@test lcfirst("")=="" -@test lcfirst("*")=="*" - -#more UTF8String tests -@test convert(UTF8String, UInt8[32,107,75], "*") == " kK" -@test convert(UTF8String, UInt8[132,107,75], "*") == "*kK" -@test convert(UTF8String, UInt8[32,107,75], "αβ") == " kK" -@test convert(UTF8String, UInt8[132,107,75], "αβ") == "αβkK" -@test convert(UTF8String, UInt8[], "*") == "" -@test convert(UTF8String, UInt8[255], "αβ") == "αβ" - -# test AbstractString functions at beginning of string.jl -immutable tstStringType <: AbstractString - data::Array{UInt8,1} -end -tstr = tstStringType("12"); -@test_throws ErrorException endof(tstr) -@test_throws ErrorException next(tstr, Bool(1)) - -gstr = GenericString("12"); -@test typeof(string(gstr))==GenericString -@test bytestring()=="" - -@test convert(Array{UInt8}, gstr) ==[49;50] -@test convert(Array{Char,1}, gstr) ==['1';'2'] -@test convert(Symbol, gstr)==symbol("12") - -@test getindex(gstr, 
Bool(1))=='1' -@test getindex(gstr,Bool(1):Bool(1))=="1" -@test getindex(gstr,AbstractVector([Bool(1):Bool(1);]))=="1" - -@test symbol(gstr)==symbol("12") - -@test_throws ErrorException sizeof(gstr) - -@test length(GenericString(""))==0 - -@test getindex(gstr,AbstractVector([Bool(1):Bool(1);]))=="1" - -@test nextind(AbstractArray([Bool(1):Bool(1);]),1)==2 - -@test ind2chr(gstr,2)==2 - -# issue #10307 -@test typeof(map(Int16,String[])) == Vector{Int16} - -for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128] - for i in [typemax(T), typemin(T)] - s = "$i" - @test get(tryparse(T, s)) == i - end -end - -for T in [Int8, Int16, Int32, Int64, Int128] - for i in [typemax(T), typemin(T)] - f = "$(i)0" - @test isnull(tryparse(T, f)) - end -end - -# issue #11142 -s = "abcdefghij" -sp = pointer(s) -@test ascii(sp) == s -@test ascii(sp,5) == "abcde" -@test typeof(ascii(sp)) == ASCIIString -@test typeof(utf8(sp)) == UTF8String -s = "abcde\uff\u2000\U1f596" -sp = pointer(s) -@test utf8(sp) == s -@test utf8(sp,5) == "abcde" -@test typeof(utf8(sp)) == UTF8String - -@test get(tryparse(BigInt, "1234567890")) == BigInt(1234567890) -@test isnull(tryparse(BigInt, "1234567890-")) - -@test get(tryparse(Float64, "64")) == 64.0 -@test isnull(tryparse(Float64, "64o")) -@test get(tryparse(Float32, "32")) == 32.0f0 -@test isnull(tryparse(Float32, "32o")) - -# issue #10994: handle embedded NUL chars for string parsing -for T in [BigInt, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128] - @test_throws ArgumentError parse(T, "1\0") -end -for T in [BigInt, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128, Float64, Float32] - @test isnull(tryparse(T, "1\0")) -end -let s = normalize_string("tést",:NFKC) - @test bytestring(Base.unsafe_convert(Cstring, s)) == s - @test bytestring(convert(Cstring, symbol(s))) == s - @test wstring(Base.unsafe_convert(Cwstring, wstring(s))) == s -end -let s = "ba\0d" - @test_throws 
ArgumentError Base.unsafe_convert(Cstring, s) - @test_throws ArgumentError Base.unsafe_convert(Cwstring, wstring(s)) -end - -# issue # 11389: Vector{UInt32} was copied with UTF32String, unlike Vector{Char} -a = UInt32[48,0] -b = UTF32String(a) -@test b=="0" -a[1] = 65 -@test b=="A" -c = Char['0','\0'] -d = UTF32String(c) -@test d=="0" -c[1] = 'A' -@test d=="A" - -# Issue #11575 -# Test invalid sequences - -byt = 0x0 # Needs to be defined outside the try block! -try - # Continuation byte not after lead - for byt in 0x80:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt]) - end - - # Test lead bytes - for byt in 0xc0:0xff - # Single lead byte at end of string - @test_throws UnicodeError Base.checkstring(UInt8[byt]) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError Base.checkstring(UInt8[byt,0]) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt,0xc0]) - end - - # Test overlong 2-byte - for byt in 0x81:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[0xc0,byt]) - end - for byt in 0x80:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[0xc1,byt]) - end - - # Test overlong 3-byte - for byt in 0x80:0x9f - @test_throws UnicodeError Base.checkstring(UInt8[0xe0,byt,0x80]) - end - - # Test overlong 4-byte - for byt in 0x80:0x8f - @test_throws UnicodeError Base.checkstring(UInt8[0xef,byt,0x80,0x80]) - end - - # Test 4-byte > 0x10ffff - for byt in 0x90:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[0xf4,byt,0x80,0x80]) - end - for byt in 0xf5:0xf7 - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80]) - end - - # Test 5-byte - for byt in 0xf8:0xfb - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80]) - end - - # Test 6-byte - for byt in 0xfc:0xfd - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80,0x80]) - end - - # Test 7-byte - @test_throws UnicodeError 
Base.checkstring(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]) - - # Three and above byte sequences - for byt in 0xe0:0xef - # Lead followed by only 1 continuation byte - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80]) - # Lead ended by non-continuation character < 0x80 - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0]) - # Lead ended by non-continuation character > 0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0xc0]) - end - - # 3-byte encoded surrogate character(s) - # Single surrogate - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80]) - # Not followed by surrogate - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]) - # Trailing surrogate first - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]) - # Followed by lead surrogate - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]) - - # Four byte sequences - for byt in 0xf0:0xf4 - # Lead followed by only 2 continuation bytes - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80]) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0]) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0xc0]) - end -catch exp; - println("Error testing checkstring: $byt, $exp") - throw(exp) -end - -# Surrogates -@test_throws UnicodeError Base.checkstring(UInt16[0xd800]) -@test_throws UnicodeError Base.checkstring(UInt16[0xdc00]) -@test_throws UnicodeError Base.checkstring(UInt16[0xdc00,0xd800]) - -# Surrogates in UTF-32 -@test_throws UnicodeError Base.checkstring(UInt32[0xd800]) -@test_throws UnicodeError Base.checkstring(UInt32[0xdc00]) -@test_throws UnicodeError Base.checkstring(UInt32[0xdc00,0xd800]) - -# Characters > 0x10ffff -@test_throws UnicodeError Base.checkstring(UInt32[0x110000]) - -# Test valid sequences -for (seq, res) in ( - 
(UInt8[0x0], (1,0,0,0,0)), # Nul byte, beginning of ASCII range - (UInt8[0x7f], (1,0,0,0,0)), # End of ASCII range - (UInt8[0xc0,0x80], (1,1,0,0,0)), # Long encoded Nul byte (Modified UTF-8, Java) - (UInt8[0xc2,0x80], (1,2,0,0,1)), # \u80, beginning of Latin1 range - (UInt8[0xc3,0xbf], (1,2,0,0,1)), # \uff, end of Latin1 range - (UInt8[0xc4,0x80], (1,4,0,0,1)), # \u100, beginning of non-Latin1 2-byte range - (UInt8[0xdf,0xbf], (1,4,0,0,1)), # \u7ff, end of non-Latin1 2-byte range - (UInt8[0xe0,0xa0,0x80], (1,8,0,1,0)), # \u800, beginning of 3-byte range - (UInt8[0xed,0x9f,0xbf], (1,8,0,1,0)), # \ud7ff, end of first part of 3-byte range - (UInt8[0xee,0x80,0x80], (1,8,0,1,0)), # \ue000, beginning of second part of 3-byte range - (UInt8[0xef,0xbf,0xbf], (1,8,0,1,0)), # \uffff, end of 3-byte range - (UInt8[0xf0,0x90,0x80,0x80],(1,16,1,0,0)), # \U10000, beginning of 4-byte range - (UInt8[0xf4,0x8f,0xbf,0xbf],(1,16,1,0,0)), # \U10ffff, end of 4-byte range - (UInt8[0xed,0xa0,0x80,0xed,0xb0,0x80], (1,0x30,1,0,0)), # Overlong \U10000, (CESU-8) - (UInt8[0xed,0xaf,0xbf,0xed,0xbf,0xbf], (1,0x30,1,0,0)), # Overlong \U10ffff, (CESU-8) - (UInt16[0x0000], (1,0,0,0,0)), # Nul byte, beginning of ASCII range - (UInt16[0x007f], (1,0,0,0,0)), # End of ASCII range - (UInt16[0x0080], (1,2,0,0,1)), # Beginning of Latin1 range - (UInt16[0x00ff], (1,2,0,0,1)), # End of Latin1 range - (UInt16[0x0100], (1,4,0,0,1)), # Beginning of non-Latin1 2-byte range - (UInt16[0x07ff], (1,4,0,0,1)), # End of non-Latin1 2-byte range - (UInt16[0x0800], (1,8,0,1,0)), # Beginning of 3-byte range - (UInt16[0xd7ff], (1,8,0,1,0)), # End of first part of 3-byte range - (UInt16[0xe000], (1,8,0,1,0)), # Beginning of second part of 3-byte range - (UInt16[0xffff], (1,8,0,1,0)), # End of 3-byte range - (UInt16[0xd800,0xdc00], (1,16,1,0,0)), # \U10000, beginning of 4-byte range - (UInt16[0xdbff,0xdfff], (1,16,1,0,0)), # \U10ffff, end of 4-byte range - (UInt32[0x0000], (1,0,0,0,0)), # Nul byte, beginning of ASCII range 
- (UInt32[0x007f], (1,0,0,0,0)), # End of ASCII range - (UInt32[0x0080], (1,2,0,0,1)), # Beginning of Latin1 range - (UInt32[0x00ff], (1,2,0,0,1)), # End of Latin1 range - (UInt32[0x0100], (1,4,0,0,1)), # Beginning of non-Latin1 2-byte range - (UInt32[0x07ff], (1,4,0,0,1)), # End of non-Latin1 2-byte range - (UInt32[0x0800], (1,8,0,1,0)), # Beginning of 3-byte range - (UInt32[0xd7ff], (1,8,0,1,0)), # End of first part of 3-byte range - (UInt32[0xe000], (1,8,0,1,0)), # Beginning of second part of 3-byte range - (UInt32[0xffff], (1,8,0,1,0)), # End of 3-byte range - (UInt32[0x10000], (1,16,1,0,0)), # \U10000, beginning of 4-byte range - (UInt32[0x10ffff], (1,16,1,0,0)), # \U10ffff, end of 4-byte range - (UInt32[0xd800,0xdc00], (1,0x30,1,0,0)),# Overlong \U10000, (CESU-8) - (UInt32[0xdbff,0xdfff], (1,0x30,1,0,0)))# Overlong \U10ffff, (CESU-8) - @test Base.checkstring(seq) == res -end - -# Test bounds checking -@test_throws BoundsError Base.checkstring(b"abcdef", -10) -@test_throws BoundsError Base.checkstring(b"abcdef", 0) -@test_throws BoundsError Base.checkstring(b"abcdef", 7) -@test_throws BoundsError Base.checkstring(b"abcdef", 3, -10) -@test_throws BoundsError Base.checkstring(b"abcdef", 3, 0) -@test_throws BoundsError Base.checkstring(b"abcdef", 3, 7) -@test_throws ArgumentError Base.checkstring(b"abcdef", 3, 1) - -# iteration -@test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ'] -@test [i for i in eachindex("ḟøøƀäṙ")] == [1, 4, 6, 8, 10, 12] -@test [x for x in enumerate("ḟøøƀäṙ")] == [(1, 'ḟ'), (2, 'ø'), (3, 'ø'), (4, 'ƀ'), (5, 'ä'), (6, 'ṙ')] - -# issue # 11464: uppercase/lowercase of UTF16String becomes a UTF8String -str = "abcdef\uff\uffff\u10ffffABCDEF" -@test typeof(uppercase("abcdef")) == ASCIIString -@test typeof(uppercase(utf8(str))) == UTF8String -@test typeof(uppercase(utf16(str))) == UTF16String -@test typeof(uppercase(utf32(str))) == UTF32String -@test typeof(lowercase("ABCDEF")) == ASCIIString -@test typeof(lowercase(utf8(str))) == 
UTF8String -@test typeof(lowercase(utf16(str))) == UTF16String -@test typeof(lowercase(utf32(str))) == UTF32String - -foomap(ch) = (ch > 65) -foobar(ch) = Char(0xd800) -foobaz(ch) = Char(0x200000) -@test_throws UnicodeError map(foomap, utf16(str)) -@test_throws UnicodeError map(foobar, utf16(str)) -@test_throws UnicodeError map(foobaz, utf16(str)) - -# issue #11551 (#11004,#10959) -function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String) - @test utf16(strUTF8) == strUTF16 - @test utf8(strUTF16) == strUTF8 -end - -# Create some ASCII, UTF8 and UTF16 -strAscii = "abcdefgh" -strA_UTF8 = ("abcdefgh\uff")[1:8] -strL_UTF8 = "abcdef\uff\uff" -str2_UTF8 = "abcd\uff\uff\u7ff\u7ff" -str3_UTF8 = "abcd\uff\uff\u7fff\u7fff" -str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff" -strS_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80") -strC_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000") -strZ_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80") -strz_UTF8 = UTF8String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0") - -strA_UTF16 = utf16(strA_UTF8) -strL_UTF16 = utf16(strL_UTF8) -str2_UTF16 = utf16(str2_UTF8) -str3_UTF16 = utf16(str3_UTF8) -str4_UTF16 = utf16(str4_UTF8) -strS_UTF16 = utf16(strS_UTF8) - -@test utf8(strAscii) == strAscii -@test utf16(strAscii) == strAscii - -tstcvt(strA_UTF8,strA_UTF16) -tstcvt(strL_UTF8,strL_UTF16) -tstcvt(str2_UTF8,str2_UTF16) -tstcvt(str3_UTF8,str3_UTF16) -tstcvt(str4_UTF8,str4_UTF16) - -# Test converting surrogate pairs -@test utf16(strS_UTF8) == strC_UTF8 -@test utf8(strS_UTF16) == strC_UTF8 - -# Test converting overlong \0 -# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! 
(in utf8.jl) -@test utf16(strZ_UTF8) == strz_UTF8 - -# Test invalid sequences - -byt = 0x0 -for T in (UTF16String,) # UTF32String - try - # Continuation byte not after lead - for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) - end - - # Test lead bytes - for byt in 0xc0:0xff - # Single lead byte at end of string - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt])) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0])) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0xc0])) - end - - # Test overlong 2-byte - for byt in 0x81:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc0,byt])) - end - for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xc1,byt])) - end - - # Test overlong 3-byte - for byt in 0x80:0x9f - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xe0,byt,0x80])) - end - - # Test overlong 4-byte - for byt in 0x80:0x8f - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xef,byt,0x80,0x80])) - end - - # Test 4-byte > 0x10ffff - for byt in 0x90:0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xf4,byt,0x80,0x80])) - end - for byt in 0xf5:0xf7 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80])) - end - - # Test 5-byte - for byt in 0xf8:0xfb - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80])) - end - - # Test 6-byte - for byt in 0xfc:0xfd - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0x80,0x80,0x80])) - end - - # Test 7-byte - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) - - # Three and above byte sequences - for byt in 0xe0:0xef - # Lead followed by only 1 continuation byte - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80])) - # Lead ended by non-continuation character < 0x80 - 
@test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0])) - # Lead ended by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0xc0])) - end - - # 3-byte encoded surrogate character(s) - # Single surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80])) - # Not followed by surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) - # Trailing surrogate first - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) - # Followed by lead surrogate - @test_throws UnicodeError convert(T, UTF8String(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) - - # Four byte sequences - for byt in 0xf0:0xf4 - # Lead followed by only 2 continuation bytes - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80])) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0])) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, UTF8String(UInt8[byt,0x80,0x80,0xc0])) - end - catch exp ; - println("Error checking $T: $byt") - throw(exp) - end -end diff --git a/test/strings/basic.jl b/test/strings/basic.jl new file mode 100644 index 0000000000000..25b879c34e27f --- /dev/null +++ b/test/strings/basic.jl @@ -0,0 +1,465 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# {starts,ends}with +@test startswith("abcd", 'a') +@test startswith("abcd", "a") +@test startswith("abcd", "ab") +@test !startswith("ab", "abcd") +@test !startswith("abcd", "bc") +@test endswith("abcd", 'd') +@test endswith("abcd", "d") +@test endswith("abcd", "cd") +@test !endswith("abcd", "dc") +@test !endswith("cd", "abcd") + +@test filter(x -> x ∈ ['f', 'o'], "foobar") == "foo" + +# string iteration, and issue #1454 +str = "é" +str_a = vcat(str...) 
+@test length(str_a)==1 +@test str_a[1] == str[1] + +str = "s\u2200" +@test str[1:end] == str + +# sizeof +@test sizeof("abc") == 3 +@test sizeof("\u2222") == 3 + +# issue #3597 +@test string(utf32(['T', 'e', 's', 't'])[1:1], "X") == "TX" + +for T = (UInt8,Int8,UInt16,Int16,UInt32,Int32,UInt64,Int64,UInt128,Int128,BigInt), + b = 2:62, _ = 1:10 + n = T != BigInt ? rand(T) : BigInt(rand(Int128)) + @test parse(T,base(b,n),b) == n +end + +# issue #6027 +let + # make symbol with invalid char + sym = symbol(Char(0xdcdb)) + @test string(sym) == string(Char(0xdcdb)) + @test expand(sym) === sym + res = string(parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1]) + @test res == """\$(Expr(:error, "invalid character \\\"\\udcdb\\\"\"))""" +end + +@test symbol("asdf") === :asdf +@test symbol(:abc,"def",'g',"hi",0) === :abcdefghi0 +@test :a < :b +@test startswith(string(gensym("asdf")),"##asdf#") +@test gensym("asdf") != gensym("asdf") +@test gensym() != gensym() +@test startswith(string(gensym()),"##") +@test_throws ArgumentError symbol("ab\0") +@test_throws ArgumentError gensym("ab\0") + +# issue #6949 +let f =IOBuffer(), + x = split("1 2 3") + @test write(f, x) == 3 + @test takebuf_string(f) == "123" + @test invoke(write, Tuple{IO, AbstractArray}, f, x) == 3 + @test takebuf_string(f) == "123" +end + +# issue #7248 +@test_throws BoundsError ind2chr("hello", -1) +@test_throws BoundsError chr2ind("hello", -1) +@test_throws BoundsError ind2chr("hellø", -1) +@test_throws BoundsError chr2ind("hellø", -1) +@test_throws BoundsError ind2chr("hello", 10) +@test_throws BoundsError chr2ind("hello", 10) +@test_throws BoundsError ind2chr("hellø", 10) +@test_throws BoundsError chr2ind("hellø", 10) +@test_throws BoundsError checkbounds("hello", 0) +@test_throws BoundsError checkbounds("hello", 6) +@test_throws BoundsError checkbounds("hello", 0:3) +@test_throws BoundsError checkbounds("hello", 4:6) +@test_throws BoundsError checkbounds("hello", [0:3;]) +@test_throws BoundsError 
checkbounds("hello", [4:6;]) +@test checkbounds("hello", 2) +@test checkbounds("hello", 1:5) +@test checkbounds("hello", [1:5;]) + +#= +# issue #7764 +let + srep = repeat("Σβ",2) + s="Σβ" + ss=SubString(s,1,endof(s)) + + @test repeat(ss,2) == "ΣβΣβ" + + @test endof(srep) == 7 + + @test next(srep, 3) == ('β',5) + @test next(srep, 7) == ('β',9) + + @test srep[7] == 'β' + @test_throws BoundsError srep[8] +end +=# + +# This caused JuliaLang/JSON.jl#82 +@test first('\x00':'\x7f') === '\x00' +@test last('\x00':'\x7f') === '\x7f' + +# make sure substrings handle last code unit even if not start of codepoint +let s = "x\u0302" + @test s[1:3] == s +end + +# issue #9781 +# float(SubString) wasn't tolerant of trailing whitespace, which was different +# to "normal" strings. This also checks we aren't being too tolerant and allowing +# any arbitrary trailing characters. +@test parse(Float64,"1\n") == 1.0 +@test [parse(Float64,x) for x in split("0,1\n",",")][2] == 1.0 +@test_throws ArgumentError parse(Float64,split("0,1 X\n",",")[2]) +@test parse(Float32,"1\n") == 1.0 +@test [parse(Float32,x) for x in split("0,1\n",",")][2] == 1.0 +@test_throws ArgumentError parse(Float32,split("0,1 X\n",",")[2]) + +#more ascii tests +@test convert(ASCIIString, UInt8[32,107,75], "*") == " kK" +@test convert(ASCIIString, UInt8[132,107,75], "*") == "*kK" +@test convert(ASCIIString, UInt8[], "*") == "" +@test convert(ASCIIString, UInt8[255], "*") == "*" + +@test ucfirst("Hola")=="Hola" +@test ucfirst("hola")=="Hola" +@test ucfirst("")=="" +@test ucfirst("*")=="*" + +@test lcfirst("Hola")=="hola" +@test lcfirst("hola")=="hola" +@test lcfirst("")=="" +@test lcfirst("*")=="*" + +#more UTF8String tests +@test convert(UTF8String, UInt8[32,107,75], "*") == " kK" +@test convert(UTF8String, UInt8[132,107,75], "*") == "*kK" +@test convert(UTF8String, UInt8[32,107,75], "αβ") == " kK" +@test convert(UTF8String, UInt8[132,107,75], "αβ") == "αβkK" +@test convert(UTF8String, UInt8[], "*") == "" +@test 
convert(UTF8String, UInt8[255], "αβ") == "αβ" + +# test AbstractString functions at beginning of string.jl +immutable tstStringType <: AbstractString + data::Array{UInt8,1} +end +tstr = tstStringType("12"); +@test_throws ErrorException endof(tstr) +@test_throws ErrorException next(tstr, Bool(1)) + +## generic string uses only endof and next ## + +immutable GenericString <: AbstractString + string::AbstractString +end + +Base.endof(s::GenericString) = endof(s.string) +Base.next(s::GenericString, i::Int) = next(s.string, i) + +gstr = GenericString("12"); +@test typeof(string(gstr))==GenericString +@test bytestring()=="" + +@test convert(Array{UInt8}, gstr) ==[49;50] +@test convert(Array{Char,1}, gstr) ==['1';'2'] +@test convert(Symbol, gstr)==symbol("12") + +@test getindex(gstr, Bool(1))=='1' +@test getindex(gstr,Bool(1):Bool(1))=="1" +@test getindex(gstr,AbstractVector([Bool(1):Bool(1);]))=="1" + +@test symbol(gstr)==symbol("12") + +@test_throws ErrorException sizeof(gstr) + +@test length(GenericString(""))==0 + +@test getindex(gstr,AbstractVector([Bool(1):Bool(1);]))=="1" + +@test nextind(AbstractArray([Bool(1):Bool(1);]),1)==2 + +@test ind2chr(gstr,2)==2 + +# issue #10307 +@test typeof(map(Int16,String[])) == Vector{Int16} + +for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128] + for i in [typemax(T), typemin(T)] + s = "$i" + @test get(tryparse(T, s)) == i + end +end + +for T in [Int8, Int16, Int32, Int64, Int128] + for i in [typemax(T), typemin(T)] + f = "$(i)0" + @test isnull(tryparse(T, f)) + end +end + +# issue #11142 +s = "abcdefghij" +sp = pointer(s) +@test ascii(sp) == s +@test ascii(sp,5) == "abcde" +@test typeof(ascii(sp)) == ASCIIString +@test typeof(utf8(sp)) == UTF8String +s = "abcde\uff\u2000\U1f596" +sp = pointer(s) +@test utf8(sp) == s +@test utf8(sp,5) == "abcde" +@test typeof(utf8(sp)) == UTF8String + +@test get(tryparse(BigInt, "1234567890")) == BigInt(1234567890) +@test isnull(tryparse(BigInt, "1234567890-")) + 
+@test get(tryparse(Float64, "64")) == 64.0 +@test isnull(tryparse(Float64, "64o")) +@test get(tryparse(Float32, "32")) == 32.0f0 +@test isnull(tryparse(Float32, "32o")) + +# issue #10994: handle embedded NUL chars for string parsing +for T in [BigInt, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128] + @test_throws ArgumentError parse(T, "1\0") +end +for T in [BigInt, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128, Float64, Float32] + @test isnull(tryparse(T, "1\0")) +end +let s = normalize_string("tést",:NFKC) + @test bytestring(Base.unsafe_convert(Cstring, s)) == s + @test bytestring(convert(Cstring, symbol(s))) == s + @test wstring(Base.unsafe_convert(Cwstring, wstring(s))) == s +end +let s = "ba\0d" + @test_throws ArgumentError Base.unsafe_convert(Cstring, s) + @test_throws ArgumentError Base.unsafe_convert(Cwstring, wstring(s)) +end + +# issue # 11389: Vector{UInt32} was copied with UTF32String, unlike Vector{Char} +a = UInt32[48,0] +b = UTF32String(a) +@test b=="0" +a[1] = 65 +@test b=="A" +c = Char['0','\0'] +d = UTF32String(c) +@test d=="0" +c[1] = 'A' +@test d=="A" + +# iteration +@test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ'] +@test [i for i in eachindex("ḟøøƀäṙ")] == [1, 4, 6, 8, 10, 12] +@test [x for x in enumerate("ḟøøƀäṙ")] == [(1, 'ḟ'), (2, 'ø'), (3, 'ø'), (4, 'ƀ'), (5, 'ä'), (6, 'ṙ')] + +# Issue #11140 +@test isvalid(utf32("a")) == true +@test isvalid(utf32("\x00")) == true +@test isvalid(UTF32String, UInt32[0xd800,0]) == false + +# Issue #11241 + +@test isvalid(ASCIIString, "is_valid_ascii") == true +@test isvalid(ASCIIString, "Σ_not_valid_ascii") == false + +# test all edge conditions +for (val, pass) in ( + (0, true), (0xd7ff, true), + (0xd800, false), (0xdfff, false), + (0xe000, true), (0xffff, true), + (0x10000, true), (0x10ffff, true), + (0x110000, false) + ) + @test isvalid(Char, val) == pass +end +for (val, pass) in ( + (b"\x00", true), + (b"\x7f", true), + (b"\x80", false), 
+ (b"\xbf", false), + (b"\xc0", false), + (b"\xff", false), + (b"\xc0\x80", false), + (b"\xc1\x80", false), + (b"\xc2\x80", true), + (b"\xc2\xc0", false), + (b"\xed\x9f\xbf", true), + (b"\xed\xa0\x80", false), + (b"\xed\xbf\xbf", false), + (b"\xee\x80\x80", true), + (b"\xef\xbf\xbf", true), + (b"\xf0\x90\x80\x80", true), + (b"\xf4\x8f\xbf\xbf", true), + (b"\xf4\x90\x80\x80", false), + (b"\xf5\x80\x80\x80", false), + (b"\ud800\udc00", false), + (b"\udbff\udfff", false), + (b"\ud800\u0100", false), + (b"\udc00\u0100", false), + (b"\udc00\ud800", false) + ) + @test isvalid(UTF8String, val) == pass +end +for (val, pass) in ( + (UInt16[0x0000], true), + (UInt16[0xd7ff,0], true), + (UInt16[0xd800,0], false), + (UInt16[0xdfff,0], false), + (UInt16[0xe000,0], true), + (UInt16[0xffff,0], true), + (UInt16[0xd800,0xdc00,0], true), + (UInt16[0xdbff,0xdfff,0], true), + (UInt16[0xd800,0x0100,0], false), + (UInt16[0xdc00,0x0100,0], false), + (UInt16[0xdc00,0xd800,0], false) + ) + @test isvalid(UTF16String, val) == pass +end +for (val, pass) in ( + (UInt32[0x0000], true), + (UInt32[0xd7ff,0], true), + (UInt32[0xd800,0], false), + (UInt32[0xdfff,0], false), + (UInt32[0xe000,0], true), + (UInt32[0xffff,0], true), + (UInt32[0x100000,0], true), + (UInt32[0x10ffff,0], true), + (UInt32[0x110000,0], false), + ) + @test isvalid(UTF32String, val) == pass +end + +# Issue #11203 +@test isvalid(ASCIIString,UInt8[]) == true +@test isvalid(UTF8String, UInt8[]) == true +@test isvalid(UTF16String,UInt16[]) == true +@test isvalid(UTF32String,UInt32[]) == true + +# Check UTF-8 characters +# Check ASCII range (true), +# then single continuation bytes and lead bytes with no following continuation bytes (false) +for (rng,flg) in ((0:0x7f, true), (0x80:0xff, false)) + for byt in rng + @test isvalid(UTF8String, UInt8[byt]) == flg + end +end +# Check overlong lead bytes for 2-character sequences (false) +for byt = 0xc0:0xc1 + @test isvalid(UTF8String, UInt8[byt,0x80]) == false +end +# Check valid lead-in 
to two-byte sequences (true) +for byt = 0xc2:0xdf + for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false)) + for cont in rng + @test isvalid(UTF8String, UInt8[byt, cont]) == flg + end + end +end +# Check three-byte sequences +for r1 in (0xe0:0xec, 0xee:0xef) + for byt = r1 + # Check for short sequence + @test isvalid(UTF8String, UInt8[byt]) == false + for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false)) + for cont in rng + @test isvalid(UTF8String, UInt8[byt, cont]) == false + @test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == flg + end + end + end +end +# Check hangul characters (0xd000-0xd7ff) hangul +# Check for short sequence, or start of surrogate pair +for (rng,flg) in ((0x00:0x7f, false), (0x80:0x9f, true), (0xa0:0xff, false)) + for cont in rng + @test isvalid(UTF8String, UInt8[0xed, cont]) == false + @test isvalid(UTF8String, UInt8[0xed, cont, 0x80]) == flg + end +end +# Check valid four-byte sequences +for byt = 0xf0:0xf4 + if (byt == 0xf0) + r0 = ((0x00:0x8f, false), (0x90:0xbf, true), (0xc0:0xff, false)) + elseif byt == 0xf4 + r0 = ((0x00:0x7f, false), (0x80:0x8f, true), (0x90:0xff, false)) + else + r0 = ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false)) + end + for (rng,flg) in r0 + for cont in rng + @test isvalid(UTF8String, UInt8[byt, cont]) == false + @test isvalid(UTF8String, UInt8[byt, cont, 0x80]) == false + @test isvalid(UTF8String, UInt8[byt, cont, 0x80, 0x80]) == flg + end + end +end +# Check five-byte sequences, should be invalid +for byt = 0xf8:0xfb + @test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80]) == false +end +# Check six-byte sequences, should be invalid +for byt = 0xfc:0xfd + @test isvalid(UTF8String, UInt8[byt, 0x80, 0x80, 0x80, 0x80, 0x80]) == false +end +# Check seven-byte sequences, should be invalid +@test isvalid(UTF8String, UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false + +# 11482 + +# isvalid +let s = "abcdef", u8 = "abcdef\uff", u16 = utf16(u8), u32 = 
utf32(u8), + bad32 = utf32(UInt32[65,0x110000]), badch = Char[0x110000][1] + + @test !isvalid(bad32) + @test !isvalid(badch) + @test isvalid(s) + @test isvalid(u8) + @test isvalid(u16) + @test isvalid(u32) + @test isvalid(ASCIIString, s) + @test isvalid(UTF8String, u8) + @test isvalid(UTF16String, u16) + @test isvalid(UTF32String, u32) +end + +# lower and upper +@test uppercase("aBc") == "ABC" +@test uppercase('A') == 'A' +@test uppercase('a') == 'A' +@test lowercase("AbC") == "abc" +@test lowercase('A') == 'a' +@test lowercase('a') == 'a' +@test uppercase('α') == '\u0391' +@test lowercase('Δ') == 'δ' +@test lowercase('\U118bf') == '\U118df' +@test uppercase('\U1044d') == '\U10425' +@test ucfirst("Abc") == "Abc" +@test ucfirst("abc") == "Abc" +@test lcfirst("ABC") == "aBC" +@test lcfirst("aBC") == "aBC" + +# issue # 11464: uppercase/lowercase of UTF16String becomes a UTF8String +str = "abcdef\uff\uffff\u10ffffABCDEF" +@test typeof(uppercase("abcdef")) == ASCIIString +@test typeof(uppercase(utf8(str))) == UTF8String +@test typeof(uppercase(utf16(str))) == UTF16String +@test typeof(uppercase(utf32(str))) == UTF32String +@test typeof(lowercase("ABCDEF")) == ASCIIString +@test typeof(lowercase(utf8(str))) == UTF8String +@test typeof(lowercase(utf16(str))) == UTF16String +@test typeof(lowercase(utf32(str))) == UTF32String + +foomap(ch) = (ch > 65) +foobar(ch) = Char(0xd800) +foobaz(ch) = Char(0x200000) +@test_throws UnicodeError map(foomap, utf16(str)) +@test_throws UnicodeError map(foobar, utf16(str)) +@test_throws UnicodeError map(foobaz, utf16(str)) diff --git a/test/strings/io.jl b/test/strings/io.jl new file mode 100644 index 0000000000000..92717a337f815 --- /dev/null +++ b/test/strings/io.jl @@ -0,0 +1,225 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +# string escaping & unescaping +cx = Any[ + 0x00000000 '\0' "\\0" + 0x00000001 '\x01' "\\x01" + 0x00000006 '\x06' "\\x06" + 0x00000007 '\a' "\\a" + 0x00000008 '\b' "\\b" + 0x00000009 '\t' "\\t" + 0x0000000a '\n' "\\n" + 0x0000000b '\v' "\\v" + 0x0000000c '\f' "\\f" + 0x0000000d '\r' "\\r" + 0x0000000e '\x0e' "\\x0e" + 0x0000001a '\x1a' "\\x1a" + 0x0000001b '\e' "\\e" + 0x0000001c '\x1c' "\\x1c" + 0x0000001f '\x1f' "\\x1f" + 0x00000020 ' ' " " + 0x0000002f '/' "/" + 0x00000030 '0' "0" + 0x00000039 '9' "9" + 0x0000003a ':' ":" + 0x00000040 '@' "@" + 0x00000041 'A' "A" + 0x0000005a 'Z' "Z" + 0x0000005b '[' "[" + 0x00000060 '`' "`" + 0x00000061 'a' "a" + 0x0000007a 'z' "z" + 0x0000007b '{' "{" + 0x0000007e '~' "~" + 0x0000007f '\x7f' "\\x7f" + 0x000000bf '\ubf' "\\ubf" + 0x000000ff '\uff' "\\uff" + 0x00000100 '\u100' "\\u100" + 0x000001ff '\u1ff' "\\u1ff" + 0x00000fff '\ufff' "\\ufff" + 0x00001000 '\u1000' "\\u1000" + 0x00001fff '\u1fff' "\\u1fff" + 0x0000ffff '\uffff' "\\uffff" + 0x00010000 '\U10000' "\\U10000" + 0x0001ffff '\U1ffff' "\\U1ffff" + 0x0002ffff '\U2ffff' "\\U2ffff" + 0x00030000 '\U30000' "\\U30000" + 0x000dffff '\Udffff' "\\Udffff" + 0x000e0000 '\Ue0000' "\\Ue0000" + 0x000effff '\Ueffff' "\\Ueffff" + 0x000f0000 '\Uf0000' "\\Uf0000" + 0x000fffff '\Ufffff' "\\Ufffff" + 0x00100000 '\U100000' "\\U100000" + 0x0010ffff '\U10ffff' "\\U10ffff" +] + +for i = 1:size(cx,1) + @test cx[i,1] == convert(UInt32, cx[i,2]) + @test string(cx[i,2]) == unescape_string(cx[i,3]) + if isascii(cx[i,2]) || !isprint(cx[i,2]) + @test cx[i,3] == escape_string(string(cx[i,2])) + end + for j = 1:size(cx,1) + str = string(cx[i,2], cx[j,2]) + @test str == unescape_string(escape_string(str)) + end +end + +for i = 0:0x7f, p = ["","\0","x","xxx","\x7f","\uFF","\uFFF", + "\uFFFF","\U10000","\U10FFF","\U10FFFF"] + c = Char(i) + cp = string(c,p) + op = string(Char(div(i,8)), oct(i%8), p) + hp = string(Char(div(i,16)), hex(i%16), p) + @test 
string(unescape_string(string("\\",oct(i,1),p))) == cp + @test string(unescape_string(string("\\",oct(i,2),p))) == cp + @test string(unescape_string(string("\\",oct(i,3),p))) == cp + @test string(unescape_string(string("\\",oct(i,4),p))) == op + @test string(unescape_string(string("\\x",hex(i,1),p))) == cp + @test string(unescape_string(string("\\x",hex(i,2),p))) == cp + @test string(unescape_string(string("\\x",hex(i,3),p))) == hp +end + +@test "\z" == unescape_string("\z") == "z" +@test "\X" == unescape_string("\X") == "X" +@test "\AbC" == unescape_string("\AbC") == "AbC" + +@test "\0" == unescape_string("\\0") +@test "\1" == unescape_string("\\1") +@test "\7" == unescape_string("\\7") +@test "\0x" == unescape_string("\\0x") +@test "\1x" == unescape_string("\\1x") +@test "\7x" == unescape_string("\\7x") +@test "\00" == unescape_string("\\00") +@test "\01" == unescape_string("\\01") +@test "\07" == unescape_string("\\07") +@test "\70" == unescape_string("\\70") +@test "\71" == unescape_string("\\71") +@test "\77" == unescape_string("\\77") +@test "\00x" == unescape_string("\\00x") +@test "\01x" == unescape_string("\\01x") +@test "\07x" == unescape_string("\\07x") +@test "\70x" == unescape_string("\\70x") +@test "\71x" == unescape_string("\\71x") +@test "\77x" == unescape_string("\\77x") +@test "\000" == unescape_string("\\000") +@test "\001" == unescape_string("\\001") +@test "\007" == unescape_string("\\007") +@test "\070" == unescape_string("\\070") +@test "\071" == unescape_string("\\071") +@test "\077" == unescape_string("\\077") +@test "\170" == unescape_string("\\170") +@test "\171" == unescape_string("\\171") +@test "\177" == unescape_string("\\177") +@test "\0001" == unescape_string("\\0001") +@test "\0011" == unescape_string("\\0011") +@test "\0071" == unescape_string("\\0071") +@test "\0701" == unescape_string("\\0701") +@test "\0711" == unescape_string("\\0711") +@test "\0771" == unescape_string("\\0771") +@test "\1701" == unescape_string("\\1701") 
+@test "\1711" == unescape_string("\\1711") +@test "\1771" == unescape_string("\\1771") + +@test "\x0" == unescape_string("\\x0") +@test "\x1" == unescape_string("\\x1") +@test "\xf" == unescape_string("\\xf") +@test "\xF" == unescape_string("\\xF") +@test "\x0x" == unescape_string("\\x0x") +@test "\x1x" == unescape_string("\\x1x") +@test "\xfx" == unescape_string("\\xfx") +@test "\xFx" == unescape_string("\\xFx") +@test "\x00" == unescape_string("\\x00") +@test "\x01" == unescape_string("\\x01") +@test "\x0f" == unescape_string("\\x0f") +@test "\x0F" == unescape_string("\\x0F") + +if !success(`iconv --version`) + warn("iconv not found, skipping unicode tests!") + @windows_only warn("Use WinRPM.install(\"win_iconv\") to run these tests") +else + # Create unicode test data directory + unicodedir = mktempdir() + + # Use perl to generate the primary data + primary_encoding = "UTF-32BE" + primary_path = replace(joinpath(unicodedir, primary_encoding*".unicode"),"\\","\\\\\\\\") + run(`perl -e " + $$fname = \"$primary_path\"; + open(UNICODEF, \">\", \"$$fname\") or die \"can\'t open $$fname: $$!\"; + binmode(UNICODEF); + print UNICODEF pack \"N*\", 0xfeff, 0..0xd7ff, 0xe000..0x10ffff; + close(UNICODEF);"` ) + + # Use iconv to generate the other data + for encoding in ["UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"] + output_path = joinpath(unicodedir, encoding*".unicode") + f = Base.FS.open(output_path,Base.JL_O_WRONLY|Base.JL_O_CREAT,Base.S_IRUSR | Base.S_IWUSR | Base.S_IRGRP | Base.S_IROTH) + run(pipe(`iconv -f $primary_encoding -t $encoding $primary_path`, f)) + Base.FS.close(f) + end + + f=open(joinpath(unicodedir,"UTF-32LE.unicode")) + str1 = utf32(read(f, UInt32, 1112065)[2:end]) + close(f) + + f=open(joinpath(unicodedir,"UTF-8.unicode")) + str2 = UTF8String(read(f, UInt8, 4382595)[4:end]) + close(f) + @test str1 == str2 + + @test str1 == open(joinpath(unicodedir,"UTF-16LE.unicode")) do f + utf16(read(f, UInt16, 2160641)[2:end]) + end + + @test str1 == 
open(joinpath(unicodedir,"UTF-16LE.unicode")) do f + utf16(read(f, UInt8, 2160641*2)) + end + @test str1 == open(joinpath(unicodedir,"UTF-16BE.unicode")) do f + utf16(read(f, UInt8, 2160641*2)) + end + + @test str1 == open(joinpath(unicodedir,"UTF-32LE.unicode")) do f + utf32(read(f, UInt8, 1112065*4)) + end + @test str1 == open(joinpath(unicodedir,"UTF-32BE.unicode")) do f + utf32(read(f, UInt8, 1112065*4)) + end + + str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" + str2 = UTF32String(UInt32[ + 8704, 32, 949, 32, 62, 32, 48, 44, 32, 8707, 32, + 948, 32, 62, 32, 48, 58, 32, 124, 120, 45, 121, 124, + 32, 60, 32, 948, 32, 8658, 32, 124, 102, 40, 120, + 41, 45, 102, 40, 121, 41, 124, 32, 60, 32, 949 + ,0]) + @test str1 == str2 + + # Cleanup unicode data + for encoding in ["UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"] + rm(joinpath(unicodedir,encoding*".unicode")) + end + rm(unicodedir) +end + +# Tests of join() +@test join([]) == "" +@test join(["a"],"?") == "a" +@test join("HELLO",'-') == "H-E-L-L-O" +@test join(1:5, ", ", " and ") == "1, 2, 3, 4 and 5" +@test join(["apples", "bananas", "pineapples"], ", ", " and ") == "apples, bananas and pineapples" + +# issue #9178 `join` calls `done()` twice on the iterables +type i9178 + nnext::Int64 + ndone::Int64 +end +Base.start(jt::i9178) = (jt.nnext=0 ; jt.ndone=0 ; 0) +Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3) +Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1)) +@test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4" + +# quotes + interpolation (issue #455) +@test "$("string")" == "string" +arr = ["a","b","c"] +@test "[$(join(arr, " - "))]" == "[a - b - c]" diff --git a/test/strings/search.jl b/test/strings/search.jl new file mode 100644 index 0000000000000..524322615b836 --- /dev/null +++ b/test/strings/search.jl @@ -0,0 +1,352 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +# some test strings +astr = "Hello, world.\n" +u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" + +## generic string uses only endof and next ## + +immutable GenericString <: AbstractString + string::AbstractString +end + +Base.endof(s::GenericString) = endof(s.string) +Base.next(s::GenericString, i::Int) = next(s.string, i) + +# ascii search +for str in [astr, GenericString(astr)] + @test_throws BoundsError search(str, 'z', 0) + @test_throws BoundsError search(str, '∀', 0) + @test search(str, 'x') == 0 + @test search(str, '\0') == 0 + @test search(str, '\u80') == 0 + @test search(str, '∀') == 0 + @test search(str, 'H') == 1 + @test search(str, 'l') == 3 + @test search(str, 'l', 4) == 4 + @test search(str, 'l', 5) == 11 + @test search(str, 'l', 12) == 0 + @test search(str, ',') == 6 + @test search(str, ',', 7) == 0 + @test search(str, '\n') == 14 + @test search(str, '\n', 15) == 0 + @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1) + @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1) +end + +# ascii rsearch +for str in [astr] + @test rsearch(str, 'x') == 0 + @test rsearch(str, '\0') == 0 + @test rsearch(str, '\u80') == 0 + @test rsearch(str, '∀') == 0 + @test rsearch(str, 'H') == 1 + @test rsearch(str, 'H', 0) == 0 + @test rsearch(str, 'l') == 11 + @test rsearch(str, 'l', 5) == 4 + @test rsearch(str, 'l', 4) == 4 + @test rsearch(str, 'l', 3) == 3 + @test rsearch(str, 'l', 2) == 0 + @test rsearch(str, ',') == 6 + @test rsearch(str, ',', 5) == 0 + @test rsearch(str, '\n') == 14 +end + +# utf-8 search +for str in (u8str, GenericString(u8str)) + @test_throws BoundsError search(str, 'z', 0) + @test_throws BoundsError search(str, '∀', 0) + @test search(str, 'z') == 0 + @test search(str, '\0') == 0 + @test search(str, '\u80') == 0 + @test search(str, '∄') == 0 + @test search(str, '∀') == 1 + @test_throws UnicodeError search(str, '∀', 2) + @test search(str, '∀', 4) == 0 + @test 
search(str, '∃') == 13 + @test_throws UnicodeError search(str, '∃', 15) + @test search(str, '∃', 16) == 0 + @test search(str, 'x') == 26 + @test search(str, 'x', 27) == 43 + @test search(str, 'x', 44) == 0 + @test search(str, 'δ') == 17 + @test_throws UnicodeError search(str, 'δ', 18) + @test search(str, 'δ', nextind(str,17)) == 33 + @test search(str, 'δ', nextind(str,33)) == 0 + @test search(str, 'ε') == 5 + @test search(str, 'ε', nextind(str,5)) == 54 + @test search(str, 'ε', nextind(str,54)) == 0 + @test search(str, 'ε', nextind(str,endof(str))) == 0 + @test search(str, 'a', nextind(str,endof(str))) == 0 + @test_throws BoundsError search(str, 'ε', nextind(str,endof(str))+1) + @test_throws BoundsError search(str, 'a', nextind(str,endof(str))+1) +end + +# utf-8 rsearch +for str in [u8str] + @test rsearch(str, 'z') == 0 + @test rsearch(str, '\0') == 0 + @test rsearch(str, '\u80') == 0 + @test rsearch(str, '∄') == 0 + @test rsearch(str, '∀') == 1 + @test rsearch(str, '∀', 0) == 0 + @test rsearch(str, '∃') == 13 + @test rsearch(str, '∃', 14) == 13 + @test rsearch(str, '∃', 13) == 13 + @test rsearch(str, '∃', 12) == 0 + @test rsearch(str, 'x') == 43 + @test rsearch(str, 'x', 42) == 26 + @test rsearch(str, 'x', 25) == 0 + @test rsearch(str, 'δ') == 33 + @test rsearch(str, 'δ', 32) == 17 + @test rsearch(str, 'δ', 16) == 0 + @test rsearch(str, 'ε') == 54 + @test rsearch(str, 'ε', 53) == 5 + @test rsearch(str, 'ε', 4) == 0 +end + +# string search with a single-char string +@test search(astr, "x") == 0:-1 +@test search(astr, "H") == 1:1 +@test search(astr, "H", 2) == 0:-1 +@test search(astr, "l") == 3:3 +@test search(astr, "l", 4) == 4:4 +@test search(astr, "l", 5) == 11:11 +@test search(astr, "l", 12) == 0:-1 +@test search(astr, "\n") == 14:14 +@test search(astr, "\n", 15) == 0:-1 + +@test search(u8str, "z") == 0:-1 +@test search(u8str, "∄") == 0:-1 +@test search(u8str, "∀") == 1:1 +@test search(u8str, "∀", 4) == 0:-1 +@test search(u8str, "∃") == 13:13 +@test 
search(u8str, "∃", 16) == 0:-1 +@test search(u8str, "x") == 26:26 +@test search(u8str, "x", 27) == 43:43 +@test search(u8str, "x", 44) == 0:-1 +@test search(u8str, "ε") == 5:5 +@test search(u8str, "ε", 7) == 54:54 +@test search(u8str, "ε", 56) == 0:-1 + +# string rsearch with a single-char string +@test rsearch(astr, "x") == 0:-1 +@test rsearch(astr, "H") == 1:1 +@test rsearch(astr, "H", 2) == 1:1 +@test rsearch(astr, "H", 0) == 0:-1 +@test rsearch(astr, "l") == 11:11 +@test rsearch(astr, "l", 10) == 4:4 +@test rsearch(astr, "l", 4) == 4:4 +@test rsearch(astr, "l", 3) == 3:3 +@test rsearch(astr, "l", 2) == 0:-1 +@test rsearch(astr, "\n") == 14:14 +@test rsearch(astr, "\n", 13) == 0:-1 + +@test rsearch(u8str, "z") == 0:-1 +@test rsearch(u8str, "∄") == 0:-1 +@test rsearch(u8str, "∀") == 1:1 +@test rsearch(u8str, "∀", 0) == 0:-1 +#TODO: setting the limit in the middle of a wide char +# makes search fail but rsearch succeed. +# Should rsearch fail as well? +#@test rsearch(u8str, "∀", 2) == 0:-1 # gives 1:3 +@test rsearch(u8str, "∃") == 13:13 +@test rsearch(u8str, "∃", 12) == 0:-1 +@test rsearch(u8str, "x") == 43:43 +@test rsearch(u8str, "x", 42) == 26:26 +@test rsearch(u8str, "x", 25) == 0:-1 +@test rsearch(u8str, "ε") == 54:54 +@test rsearch(u8str, "ε", 53) == 5:5 +@test rsearch(u8str, "ε", 4) == 0:-1 + +# string search with a single-char regex +@test search(astr, r"x") == 0:-1 +@test search(astr, r"H") == 1:1 +@test search(astr, r"H", 2) == 0:-1 +@test search(astr, r"l") == 3:3 +@test search(astr, r"l", 4) == 4:4 +@test search(astr, r"l", 5) == 11:11 +@test search(astr, r"l", 12) == 0:-1 +@test search(astr, r"\n") == 14:14 +@test search(astr, r"\n", 15) == 0:-1 +@test search(u8str, r"z") == 0:-1 +@test search(u8str, r"∄") == 0:-1 +@test search(u8str, r"∀") == 1:1 +@test search(u8str, r"∀", 4) == 0:-1 +@test search(u8str, r"∀") == search(u8str, r"\u2200") +@test search(u8str, r"∀", 4) == search(u8str, r"\u2200", 4) +@test search(u8str, r"∃") == 13:13 +@test 
search(u8str, r"∃", 16) == 0:-1 +@test search(u8str, r"x") == 26:26 +@test search(u8str, r"x", 27) == 43:43 +@test search(u8str, r"x", 44) == 0:-1 +@test search(u8str, r"ε") == 5:5 +@test search(u8str, r"ε", 7) == 54:54 +@test search(u8str, r"ε", 56) == 0:-1 +for i = 1:endof(astr) + @test search(astr, r"."s, i) == i:i +end +for i = 1:endof(u8str) + if isvalid(u8str,i) + @test search(u8str, r"."s, i) == i:i + end +end + +# string search with a zero-char string +for i = 1:endof(astr) + @test search(astr, "", i) == i:i-1 +end +for i = 1:endof(u8str) + @test search(u8str, "", i) == i:i-1 +end +@test search("", "") == 1:0 + +# string rsearch with a zero-char string +for i = 1:endof(astr) + @test rsearch(astr, "", i) == i:i-1 +end +for i = 1:endof(u8str) + @test rsearch(u8str, "", i) == i:i-1 +end +@test rsearch("", "") == 1:0 + +# string search with a zero-char regex +for i = 1:endof(astr) + @test search(astr, r"", i) == i:i-1 +end +for i = 1:endof(u8str) + # TODO: should regex search fast-forward invalid indices? 
+ if isvalid(u8str,i) + @test search(u8str, r""s, i) == i:i-1 + end +end + +# string search with a two-char string literal +@test search("foo,bar,baz", "xx") == 0:-1 +@test search("foo,bar,baz", "fo") == 1:2 +@test search("foo,bar,baz", "fo", 3) == 0:-1 +@test search("foo,bar,baz", "oo") == 2:3 +@test search("foo,bar,baz", "oo", 4) == 0:-1 +@test search("foo,bar,baz", "o,") == 3:4 +@test search("foo,bar,baz", "o,", 5) == 0:-1 +@test search("foo,bar,baz", ",b") == 4:5 +@test search("foo,bar,baz", ",b", 6) == 8:9 +@test search("foo,bar,baz", ",b", 10) == 0:-1 +@test search("foo,bar,baz", "az") == 10:11 +@test search("foo,bar,baz", "az", 12) == 0:-1 + +# issue #9365 +# string search with a two-char UTF-8 (2 byte) string literal +@test search("ééé", "éé") == 1:3 +@test search("ééé", "éé", 1) == 1:3 +# string search with a two-char UTF-8 (3 byte) string literal +@test search("€€€", "€€") == 1:4 +@test search("€€€", "€€", 1) == 1:4 +# string search with a two-char UTF-8 (4 byte) string literal +@test search("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 1:5 +@test search("\U1f596\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5 + +# string search with a two-char UTF-8 (2 byte) string literal +@test search("éé", "éé") == 1:3 +@test search("éé", "éé", 1) == 1:3 +# string search with a two-char UTF-8 (3 byte) string literal +@test search("€€", "€€") == 1:4 +@test search("€€", "€€", 1) == 1:4 +# string search with a two-char UTF-8 (4 byte) string literal +@test search("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 +@test search("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5 + +# string rsearch with a two-char UTF-8 (2 byte) string literal +@test rsearch("ééé", "éé") == 3:5 +@test rsearch("ééé", "éé", endof("ééé")) == 3:5 +# string rsearch with a two-char UTF-8 (3 byte) string literal +@test rsearch("€€€", "€€") == 4:7 +@test rsearch("€€€", "€€", endof("€€€")) == 4:7 +# string rsearch with a two-char UTF-8 (4 byte) string literal +@test rsearch("\U1f596\U1f596\U1f596", "\U1f596\U1f596") 
== 5:9 +@test rsearch("\U1f596\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 5:9 + +# string rsearch with a two-char UTF-8 (2 byte) string literal +@test rsearch("éé", "éé") == 1:3 # should really be 1:4! +@test rsearch("éé", "éé", endof("ééé")) == 1:3 +# string search with a two-char UTF-8 (3 byte) string literal +@test rsearch("€€", "€€") == 1:4 # should really be 1:6! +@test rsearch("€€", "€€", endof("€€€")) == 1:4 +# string search with a two-char UTF-8 (4 byte) string literal +@test rsearch("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 # should really be 1:8! +@test rsearch("\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 1:5 + +# string rsearch with a two-char string literal +@test rsearch("foo,bar,baz", "xx") == 0:-1 +@test rsearch("foo,bar,baz", "fo") == 1:2 +@test rsearch("foo,bar,baz", "fo", 1) == 0:-1 +@test rsearch("foo,bar,baz", "oo") == 2:3 +@test rsearch("foo,bar,baz", "oo", 2) == 0:-1 +@test rsearch("foo,bar,baz", "o,") == 3:4 +@test rsearch("foo,bar,baz", "o,", 1) == 0:-1 +@test rsearch("foo,bar,baz", ",b") == 8:9 +@test rsearch("foo,bar,baz", ",b", 6) == 4:5 +@test rsearch("foo,bar,baz", ",b", 3) == 0:-1 +@test rsearch("foo,bar,baz", "az") == 10:11 +@test rsearch("foo,bar,baz", "az", 10) == 0:-1 + +# array rsearch +@test rsearch(UInt8[1,2,3],UInt8[2,3],3) == 2:3 +@test rsearch(UInt8[1,2,3],UInt8[2,3],1) == 0:-1 + +# string search with a two-char regex +@test search("foo,bar,baz", r"xx") == 0:-1 +@test search("foo,bar,baz", r"fo") == 1:2 +@test search("foo,bar,baz", r"fo", 3) == 0:-1 +@test search("foo,bar,baz", r"oo") == 2:3 +@test search("foo,bar,baz", r"oo", 4) == 0:-1 +@test search("foo,bar,baz", r"o,") == 3:4 +@test search("foo,bar,baz", r"o,", 5) == 0:-1 +@test search("foo,bar,baz", r",b") == 4:5 +@test search("foo,bar,baz", r",b", 6) == 8:9 +@test search("foo,bar,baz", r",b", 10) == 0:-1 +@test search("foo,bar,baz", r"az") == 10:11 +@test search("foo,bar,baz", r"az", 12) == 0:-1 + +@test 
searchindex("foo", 'o') == 2 +@test searchindex("foo", 'o', 3) == 3 + +# string searchindex with a two-char UTF-8 (2 byte) string literal +@test searchindex("ééé", "éé") == 1 +@test searchindex("ééé", "éé", 1) == 1 +# string searchindex with a two-char UTF-8 (3 byte) string literal +@test searchindex("€€€", "€€") == 1 +@test searchindex("€€€", "€€", 1) == 1 +# string searchindex with a two-char UTF-8 (4 byte) string literal +@test searchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 1 +@test searchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596", 1) == 1 + +# string searchindex with a two-char UTF-8 (2 byte) string literal +@test searchindex("éé", "éé") == 1 +@test searchindex("éé", "éé", 1) == 1 +# string searchindex with a two-char UTF-8 (3 byte) string literal +@test searchindex("€€", "€€") == 1 +@test searchindex("€€", "€€", 1) == 1 +# string searchindex with a two-char UTF-8 (4 byte) string literal +@test searchindex("\U1f596\U1f596", "\U1f596\U1f596") == 1 +@test searchindex("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1 + +# string rsearchindex with a two-char UTF-8 (2 byte) string literal +@test rsearchindex("ééé", "éé") == 3 +@test rsearchindex("ééé", "éé", endof("ééé")) == 3 +# string rsearchindex with a two-char UTF-8 (3 byte) string literal +@test rsearchindex("€€€", "€€") == 4 +@test rsearchindex("€€€", "€€", endof("€€€")) == 4 +# string rsearchindex with a two-char UTF-8 (4 byte) string literal +@test rsearchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 5 +@test rsearchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 5 + +# string rsearchindex with a two-char UTF-8 (2 byte) string literal +@test rsearchindex("éé", "éé") == 1 +@test rsearchindex("éé", "éé", endof("ééé")) == 1 +# string searchindex with a two-char UTF-8 (3 byte) string literal +@test rsearchindex("€€", "€€") == 1 +@test rsearchindex("€€", "€€", endof("€€€")) == 1 +# string searchindex with a two-char UTF-8 (4 byte) string literal +@test 
rsearchindex("\U1f596\U1f596", "\U1f596\U1f596") == 1 +@test rsearchindex("\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 1 diff --git a/test/strings/types.jl b/test/strings/types.jl new file mode 100644 index 0000000000000..66c49b1e88ba0 --- /dev/null +++ b/test/strings/types.jl @@ -0,0 +1,196 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +## SubString, RevString, and RepString tests ## + +## SubString tests ## +u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" +u8str2 = u8str^2 +len_u8str = length(u8str) +slen_u8str = length(u8str) +len_u8str2 = length(u8str2) +slen_u8str2 = length(u8str2) + +@test len_u8str2 == 2 * len_u8str +@test slen_u8str2 == 2 * slen_u8str + +u8str2plain = utf8(u8str2) + +for i1 = 1:length(u8str2) + if !isvalid(u8str2, i1); continue; end + for i2 = i1:length(u8str2) + if !isvalid(u8str2, i2); continue; end + @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) + @test length(u8str2[i1:i2]) == length(u8str2plain[i1:i2]) + @test u8str2[i1:i2] == u8str2plain[i1:i2] + end +end + +str="tempus fugit" #length(str)==12 +ss=SubString(str,1,length(str)) #match source string +@test length(ss)==length(str) + +ss=SubString(str,1,0) #empty SubString +@test length(ss)==0 + +ss=SubString(str,14,20) #start indexed beyond source string length +@test length(ss)==0 + +ss=SubString(str,10,16) #end indexed beyond source string length +@test length(ss)==3 + +str2="" +ss=SubString(str2,1,4) #empty source string +@test length(ss)==0 + +ss=SubString(str2,1,1) #empty source string, identical start and end index +@test length(ss)==0 + +@test SubString("foobar",big(1),big(3)) == "foo" + +str = "aa\u2200\u2222bb" +u = SubString(str, 3, 6) +@test length(u)==2 +b = IOBuffer() +write(b, u) +@test takebuf_string(b) == "\u2200\u2222" + +str = "føøbar" +u = SubString(str, 4, 3) +@test length(u)==0 +b = IOBuffer() +write(b, u) +@test takebuf_string(b) == "" + +str = "føøbar" +u = SubString(str, 10, 10) 
+@test length(u)==0 +b = IOBuffer() +write(b, u) +@test takebuf_string(b) == "" + +# search and SubString (issue #5679) +str = "Hello, world!" +u = SubString(str, 1, 5) +@test rsearch(u, "World") == 0:-1 +@test rsearch(u, 'z') == 0 +@test rsearch(u, "ll") == 3:4 + +# sizeof +@test sizeof(SubString("abc\u2222def",4,4)) == 3 + +# issue #3710 +@test prevind(SubString("{var}",2,4),4) == 3 + +# issue #4183 +@test split(SubString(ascii("x"), 2, 0), "y") == AbstractString[""] +@test split(SubString(utf8("x"), 2, 0), "y") == AbstractString[""] + +# issue #6772 +@test float(SubString("10",1,1)) === 1.0 +@test float(SubString("1 0",1,1)) === 1.0 +@test parse(Float32,SubString("10",1,1)) === 1.0f0 + +# issue #5870 +@test !ismatch(Regex("aa"), SubString("",1,0)) +@test ismatch(Regex(""), SubString("",1,0)) + +# isvalid(), chr2ind() and ind2chr() for SubString{DirectIndexString} +let s="lorem ipsum", + sdict=Dict(SubString(s,1,11)=>s, + SubString(s,1,6)=>"lorem ", + SubString(s,1,0)=>"", + SubString(s,2,4)=>"ore", + SubString(s,2,16)=>"orem ipsum", + SubString(s,12,14)=>"" + ) + for (ss,s) in sdict + for i in -1:12 + @test isvalid(ss,i)==isvalid(s,i) + end + end + for (ss,s) in sdict + for i in 1:length(ss) + @test ind2chr(ss,i)==ind2chr(s,i) + end + end + for (ss,s) in sdict + for i in 1:length(ss) + @test chr2ind(ss,i)==chr2ind(s,i) + end + end +end #let + +#for isvalid(SubString{UTF8String}) +let s = utf8("Σx + βz - 2") + for i in -1:length(s)+2 + ss=SubString(s,1,i) + @test isvalid(ss,i)==isvalid(s,i) + end +end + +ss=SubString("hello",1,5) +@test_throws BoundsError ind2chr(ss, -1) +@test_throws BoundsError chr2ind(ss, -1) +@test_throws BoundsError chr2ind(ss, 10) +@test_throws BoundsError ind2chr(ss, 10) + +# length(SubString{UTF8String}) performance specialization +let s = "|η(α)-ϕ(κ)| < ε" + @test length(SubString(s,1,0))==length(s[1:0]) + @test length(SubString(s,4,4))==length(s[4:4]) + @test length(SubString(s,1,7))==length(s[1:7]) + @test 
length(SubString(s,4,11))==length(s[4:11]) +end + +## Reverse strings ## + +# issue #4586 +@test rsplit(RevString("ailuj"),'l') == ["ju","ia"] +@test parse(Float64,RevString("64")) === 46.0 + +# reverseind +for T in (ASCIIString, UTF8String, UTF16String, UTF32String) + for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") + for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") + for c in ('X', 'δ', '\U0001d6a5') + T != ASCIIString || (isascii(prefix) && isascii(suffix) && isascii(c)) || continue + s = convert(T, string(prefix, c, suffix)) + ri = search(reverse(s), c) + @test reverse(s) == RevString(s) + @test c == s[reverseind(s, ri)] == reverse(s)[ri] + s = RevString(s) + ri = search(reverse(s), c) + @test c == s[reverseind(s, ri)] == reverse(s)[ri] + s = convert(T, string(prefix, prefix, c, suffix, suffix)) + pre = convert(T, prefix) + sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) + ri = search(reverse(sb), c) + @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri] + end + end + end +end + +## Repeat strings ## + +# issue #7764 +let + srep = RepString("Σβ",2) + s="Σβ" + ss=SubString(s,1,endof(s)) + + @test ss^2 == "ΣβΣβ" + @test RepString(ss,2) == "ΣβΣβ" + + @test endof(srep) == 7 + + @test next(srep, 3) == ('β',5) + @test next(srep, 7) == ('β',9) + + @test srep[7] == 'β' + @test_throws BoundsError srep[8] +end + +## Rope strings ## + +@test sizeof(RopeString("abc","def")) == 6 diff --git a/test/strings/util.jl b/test/strings/util.jl new file mode 100644 index 0000000000000..65ab6c11c5526 --- /dev/null +++ b/test/strings/util.jl @@ -0,0 +1,210 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +# padding (lpad and rpad) +@test lpad("foo", 3) == "foo" +@test rpad("foo", 3) == "foo" +@test lpad("foo", 5) == " foo" +@test rpad("foo", 5) == "foo " +@test lpad("foo", 5, " ") == " foo" +@test rpad("foo", 5, " ") == "foo " +@test lpad("foo", 6, " ") == " foo" +@test rpad("foo", 6, " ") == "foo " + +# string manipulation +@test strip("\t hi \n") == "hi" +@test strip("foobarfoo", ['f', 'o']) == "bar" + +# split +@test isequal(split("foo,bar,baz", 'x'), ["foo,bar,baz"]) +@test isequal(split("foo,bar,baz", ','), ["foo","bar","baz"]) +@test isequal(split("foo,bar,baz", ","), ["foo","bar","baz"]) +@test isequal(split("foo,bar,baz", r","), ["foo","bar","baz"]) +@test isequal(split("foo,bar,baz", ','; limit=0), ["foo","bar","baz"]) +@test isequal(split("foo,bar,baz", ','; limit=1), ["foo,bar,baz"]) +@test isequal(split("foo,bar,baz", ','; limit=2), ["foo","bar,baz"]) +@test isequal(split("foo,bar,baz", ','; limit=3), ["foo","bar","baz"]) +@test isequal(split("foo,bar", "o,b"), ["fo","ar"]) + +@test isequal(split("", ','), [""]) +@test isequal(split(",", ','), ["",""]) +@test isequal(split(",,", ','), ["","",""]) +@test isequal(split("", ',' ; keep=false), []) +@test isequal(split(",", ',' ; keep=false), []) +@test isequal(split(",,", ','; keep=false), []) + +@test isequal(split("a b c"), ["a","b","c"]) +@test isequal(split("a b \t c\n"), ["a","b","c"]) + +@test isequal(rsplit("foo,bar,baz", 'x'), ["foo,bar,baz"]) +@test isequal(rsplit("foo,bar,baz", ','), ["foo","bar","baz"]) +@test isequal(rsplit("foo,bar,baz", ","), ["foo","bar","baz"]) +@test isequal(rsplit("foo,bar,baz", ','; limit=0), ["foo","bar","baz"]) +@test isequal(rsplit("foo,bar,baz", ','; limit=1), ["foo,bar,baz"]) +@test isequal(rsplit("foo,bar,baz", ','; limit=2), ["foo,bar","baz"]) +@test isequal(rsplit("foo,bar,baz", ','; limit=3), ["foo","bar","baz"]) +@test isequal(rsplit("foo,bar", "o,b"), ["fo","ar"]) + +@test isequal(rsplit("", ','), [""]) +@test 
isequal(rsplit(",", ','), ["",""]) +@test isequal(rsplit(",,", ','), ["","",""]) +@test isequal(rsplit(",,", ','; limit=2), [",",""]) +@test isequal(rsplit("", ',' ; keep=false), []) +@test isequal(rsplit(",", ',' ; keep=false), []) +@test isequal(rsplit(",,", ','; keep=false), []) + +#@test isequal(rsplit("a b c"), ["a","b","c"]) +#@test isequal(rsplit("a b \t c\n"), ["a","b","c"]) + +let str = "a.:.ba..:..cba.:.:.dcba.:." +@test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""]) +@test isequal(split(str, ".:."; keep=false), ["a","ba.",".cba",":.dcba"]) +@test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""]) +@test isequal(split(str, r"\.(:\.)+"), ["a","ba.",".cba","dcba",""]) +@test isequal(split(str, r"\.(:\.)+"; keep=false), ["a","ba.",".cba","dcba"]) +@test isequal(split(str, r"\.+:\.+"), ["a","ba","cba",":.dcba",""]) +@test isequal(split(str, r"\.+:\.+"; keep=false), ["a","ba","cba",":.dcba"]) + +@test isequal(rsplit(str, ".:."), ["a","ba.",".cba.:","dcba",""]) +@test isequal(rsplit(str, ".:."; keep=false), ["a","ba.",".cba.:","dcba"]) +@test isequal(rsplit(str, ".:."; limit=2), ["a.:.ba..:..cba.:.:.dcba", ""]) +@test isequal(rsplit(str, ".:."; limit=3), ["a.:.ba..:..cba.:", "dcba", ""]) +@test isequal(rsplit(str, ".:."; limit=4), ["a.:.ba.", ".cba.:", "dcba", ""]) +@test isequal(rsplit(str, ".:."; limit=5), ["a", "ba.", ".cba.:", "dcba", ""]) +@test isequal(rsplit(str, ".:."; limit=6), ["a", "ba.", ".cba.:", "dcba", ""]) +end + +# zero-width splits +@test isequal(rsplit("", ""), [""]) + +@test isequal(split("", ""), [""]) +@test isequal(split("", r""), [""]) +@test isequal(split("abc", ""), ["a","b","c"]) +@test isequal(split("abc", r""), ["a","b","c"]) +@test isequal(split("abcd", r"b?"), ["a","c","d"]) +@test isequal(split("abcd", r"b*"), ["a","c","d"]) +@test isequal(split("abcd", r"b+"), ["a","cd"]) +@test isequal(split("abcd", r"b?c?"), ["a","d"]) +@test isequal(split("abcd", r"[bc]?"), ["a","","d"]) +@test isequal(split("abcd", 
r"a*"), ["","b","c","d"]) +@test isequal(split("abcd", r"a+"), ["","bcd"]) +@test isequal(split("abcd", r"d*"), ["a","b","c",""]) +@test isequal(split("abcd", r"d+"), ["abc",""]) +@test isequal(split("abcd", r"[ad]?"), ["","b","c",""]) + +# replace +@test replace("\u2202", '*', '\0') == "\u2202" + +@test replace("foobar", 'o', '0') == "f00bar" +@test replace("foobar", 'o', '0', 1) == "f0obar" +@test replace("foobar", 'o', "") == "fbar" +@test replace("foobar", 'o', "", 1) == "fobar" +@test replace("foobar", 'f', 'F') == "Foobar" +@test replace("foobar", 'r', 'R') == "foobaR" + +@test replace("foofoofoo", "foo", "bar") == "barbarbar" +@test replace("foobarfoo", "foo", "baz") == "bazbarbaz" +@test replace("barfoofoo", "foo", "baz") == "barbazbaz" + +@test replace("", "", "") == "" +@test replace("", "", "x") == "x" +@test replace("", "x", "y") == "" + +@test replace("abcd", "", "^") == "^a^b^c^d^" +@test replace("abcd", "b", "^") == "a^cd" +@test replace("abcd", r"b?", "^") == "^a^c^d^" +@test replace("abcd", r"b+", "^") == "a^cd" +@test replace("abcd", r"b?c?", "^") == "^a^d^" +@test replace("abcd", r"[bc]?", "^") == "^a^^d^" + +@test replace("foobarfoo", r"(fo|ba)", "xx") == "xxoxxrxxo" +@test replace("foobarfoo", r"(foo|ba)", "bar") == "barbarrbar" + +@test replace("foobar", 'o', 'ø') == "føøbar" +@test replace("foobar", 'o', 'ø', 1) == "føobar" +@test replace("føøbar", 'ø', 'o') == "foobar" +@test replace("føøbar", 'ø', 'o', 1) == "foøbar" +@test replace("føøbar", 'ø', 'ö') == "fööbar" +@test replace("føøbar", 'ø', 'ö', 1) == "föøbar" +@test replace("føøbar", 'ø', "") == "fbar" +@test replace("føøbar", 'ø', "", 1) == "føbar" +@test replace("føøbar", 'f', 'F') == "Føøbar" +@test replace("ḟøøbar", 'ḟ', 'F') == "Føøbar" +@test replace("føøbar", 'f', 'Ḟ') == "Ḟøøbar" +@test replace("ḟøøbar", 'ḟ', 'Ḟ') == "Ḟøøbar" +@test replace("føøbar", 'r', 'R') == "føøbaR" +@test replace("føøbaṙ", 'ṙ', 'R') == "føøbaR" +@test replace("føøbar", 'r', 'Ṙ') == "føøbaṘ" +@test 
replace("føøbaṙ", 'ṙ', 'Ṙ') == "føøbaṘ" + +@test replace("ḟøøḟøøḟøø", "ḟøø", "bar") == "barbarbar" +@test replace("ḟøøbarḟøø", "ḟøø", "baz") == "bazbarbaz" +@test replace("barḟøøḟøø", "ḟøø", "baz") == "barbazbaz" + +@test replace("foofoofoo", "foo", "ƀäṙ") == "ƀäṙƀäṙƀäṙ" +@test replace("fooƀäṙfoo", "foo", "baz") == "bazƀäṙbaz" +@test replace("ƀäṙfoofoo", "foo", "baz") == "ƀäṙbazbaz" + +@test replace("foofoofoo", "foo", "bar") == "barbarbar" +@test replace("foobarfoo", "foo", "ƀäż") == "ƀäżbarƀäż" +@test replace("barfoofoo", "foo", "ƀäż") == "barƀäżƀäż" + +@test replace("ḟøøḟøøḟøø", "ḟøø", "ƀäṙ") == "ƀäṙƀäṙƀäṙ" +@test replace("ḟøøƀäṙḟøø", "ḟøø", "baz") == "bazƀäṙbaz" +@test replace("ƀäṙḟøøḟøø", "ḟøø", "baz") == "ƀäṙbazbaz" + +@test replace("ḟøøḟøøḟøø", "ḟøø", "bar") == "barbarbar" +@test replace("ḟøøbarḟøø", "ḟøø", "ƀäż") == "ƀäżbarƀäż" +@test replace("barḟøøḟøø", "ḟøø", "ƀäż") == "barƀäżƀäż" + +@test replace("ḟøøḟøøḟøø", "ḟøø", "ƀäṙ") == "ƀäṙƀäṙƀäṙ" +@test replace("ḟøøƀäṙḟøø", "ḟøø", "ƀäż") == "ƀäżƀäṙƀäż" +@test replace("ƀäṙḟøøḟøø", "ḟøø", "ƀäż") == "ƀäṙƀäżƀäż" + +@test replace("", "", "ẍ") == "ẍ" +@test replace("", "ẍ", "ÿ") == "" + +@test replace("äƀçđ", "", "π") == "πäπƀπçπđπ" +@test replace("äƀçđ", "ƀ", "π") == "äπçđ" +@test replace("äƀçđ", r"ƀ?", "π") == "πäπçπđπ" +@test replace("äƀçđ", r"ƀ+", "π") == "äπçđ" +@test replace("äƀçđ", r"ƀ?ç?", "π") == "πäπđπ" +@test replace("äƀçđ", r"[ƀç]?", "π") == "πäππđπ" + +@test replace("foobarfoo", r"(fo|ba)", "ẍẍ") == "ẍẍoẍẍrẍẍo" + +@test replace("ḟøøbarḟøø", r"(ḟø|ba)", "xx") == "xxøxxrxxø" +@test replace("ḟøøbarḟøø", r"(ḟøø|ba)", "bar") == "barbarrbar" + +@test replace("fooƀäṙfoo", r"(fo|ƀä)", "xx") == "xxoxxṙxxo" +@test replace("fooƀäṙfoo", r"(foo|ƀä)", "ƀäṙ") == "ƀäṙƀäṙṙƀäṙ" + +@test replace("ḟøøƀäṙḟøø", r"(ḟø|ƀä)", "xx") == "xxøxxṙxxø" +@test replace("ḟøøƀäṙḟøø", r"(ḟøø|ƀä)", "ƀäṙ") == "ƀäṙƀäṙṙƀäṙ" + +@test replace("foo", "oo", uppercase) == "fOO" + +# chomp/chop +@test chomp("foo\n") == "foo" +@test chop("foob") == 
"foo" + +# bytes2hex and hex2bytes +hex_str = "d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592" +bin_val = hex2bytes(hex_str) + +@test div(length(hex_str), 2) == length(bin_val) +@test hex_str == bytes2hex(bin_val) + +bin_val = hex2bytes("07bf") +@test bin_val[1] == 7 +@test bin_val[2] == 191 +@test typeof(bin_val) == Array{UInt8, 1} +@test length(bin_val) == 2 + +# all valid hex chars +@test "0123456789abcdefabcdef" == bytes2hex(hex2bytes("0123456789abcdefABCDEF")) + +# odd size +@test_throws ArgumentError hex2bytes("0123456789abcdefABCDEF0") + +#non-hex characters +@test_throws ArgumentError hex2bytes("0123456789abcdefABCDEFGH") diff --git a/test/triplequote.jl b/test/triplequote.jl new file mode 100644 index 0000000000000..074afec18a22c --- /dev/null +++ b/test/triplequote.jl @@ -0,0 +1,68 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# triple-quote delimited strings +@test """abc""" == "abc" +@test """ab"c""" == "ab\"c" +@test """ab""c""" == "ab\"\"c" +@test """ab"\"c""" == "ab\"\"c" +@test """abc\"""" == "abc\"" +n = 3 +@test """$n\n""" == "$n\n" +@test """$(n)""" == "3" +@test """$(2n)""" == "6" +@test """$(n+4)""" == "7" +@test """$("string")""" == "string" +a = [3,1,2] +@test """$(a[2])""" == "1" +@test """$(a[3]+7)""" == "9" +@test """$(floor(Int,4.5))""" == "4" +nl = " +" +@test """ + a + b + + c + """ == "a$(nl)b$(nl)$(nl)c$(nl)" +@test """ + """ == "" +@test """x + a + """ == "x$(nl) a$(nl)" +@test """ + $n + """ == " $n$(nl)" +@test """ + a + b + c""" == " a$(nl)b$(nl) c" +# tabs + spaces +@test """ + a + b + """ == " a$(nl) b$(nl)" +@test """ + a + """ == "a$(nl) " +s = " p" +@test """ + $s""" == "$s" +@test """ + $s + """ == " $s$(nl)" +@test """\t""" == "\t" +@test """ + \t""" == "" +@test """ + foo + \tbar""" == "foo$(nl)\tbar" +@test """ + foo + \tbar + """ == "foo$(nl)\tbar$(nl)" +@test """ + foo + bar\t""" == "foo$(nl)bar\t" +@test """ + $("\n ") + """ == "\n $(nl)" diff --git 
a/test/unicode.jl b/test/unicode.jl index 6af8e8e63a527..1e3c384306cd0 100644 --- a/test/unicode.jl +++ b/test/unicode.jl @@ -1,140 +1,6 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license -# UTF16 -u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" -u16 = utf16(u8) -@test sizeof(u16) == 18 -@test length(u16.data) == 10 && u16.data[end] == 0 -@test length(u16) == 5 -@test utf8(u16) == u8 -@test collect(u8) == collect(u16) -@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18)) -@test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16))) -@test_throws UnicodeError utf16(utf32(Char(0x120000))) -@test_throws UnicodeError utf16(UInt8[1,2,3]) - -# UTF32 -u32 = utf32(u8) -@test sizeof(u32) == 20 -@test length(u32.data) == 6 && u32.data[end] == Char(0) -@test length(u32) == 5 -@test utf8(u32) == u8 -@test collect(u8) == collect(u32) -@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 20), 1, reinterpret(UInt8, u32.data), 1, 20)) -@test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32))) -@test_throws UnicodeError utf32(UInt8[1,2,3]) - -# Wstring -w = wstring(u8) -@test length(w) == 5 && utf8(w) == u8 && collect(u8) == collect(w) -@test u8 == WString(w.data) - -if !success(`iconv --version`) - warn("iconv not found, skipping unicode tests!") - @windows_only warn("Use WinRPM.install(\"win_iconv\") to run these tests") -else - # Create unicode test data directory - unicodedir = mktempdir() - - # Use perl to generate the primary data - primary_encoding = "UTF-32BE" - primary_path = replace(joinpath(unicodedir, primary_encoding*".unicode"),"\\","\\\\\\\\") - run(`perl -e " - $$fname = \"$primary_path\"; - open(UNICODEF, \">\", \"$$fname\") or die \"can\'t open $$fname: $$!\"; - binmode(UNICODEF); - print UNICODEF pack \"N*\", 0xfeff, 0..0xd7ff, 0xe000..0x10ffff; - close(UNICODEF);"` ) - - # Use iconv to generate the other data - for encoding in 
["UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"] - output_path = joinpath(unicodedir, encoding*".unicode") - f = Base.FS.open(output_path,Base.JL_O_WRONLY|Base.JL_O_CREAT,Base.S_IRUSR | Base.S_IWUSR | Base.S_IRGRP | Base.S_IROTH) - run(pipe(`iconv -f $primary_encoding -t $encoding $primary_path`, f)) - Base.FS.close(f) - end - - f=open(joinpath(unicodedir,"UTF-32LE.unicode")) - str1 = utf32(read(f, UInt32, 1112065)[2:end]) - close(f) - - f=open(joinpath(unicodedir,"UTF-8.unicode")) - str2 = UTF8String(read(f, UInt8, 4382595)[4:end]) - close(f) - @test str1 == str2 - - @test str1 == open(joinpath(unicodedir,"UTF-16LE.unicode")) do f - utf16(read(f, UInt16, 2160641)[2:end]) - end - - @test str1 == open(joinpath(unicodedir,"UTF-16LE.unicode")) do f - utf16(read(f, UInt8, 2160641*2)) - end - @test str1 == open(joinpath(unicodedir,"UTF-16BE.unicode")) do f - utf16(read(f, UInt8, 2160641*2)) - end - - @test str1 == open(joinpath(unicodedir,"UTF-32LE.unicode")) do f - utf32(read(f, UInt8, 1112065*4)) - end - @test str1 == open(joinpath(unicodedir,"UTF-32BE.unicode")) do f - utf32(read(f, UInt8, 1112065*4)) - end - - str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" - str2 = UTF32String(UInt32[ - 8704, 32, 949, 32, 62, 32, 48, 44, 32, 8707, 32, - 948, 32, 62, 32, 48, 58, 32, 124, 120, 45, 121, 124, - 32, 60, 32, 948, 32, 8658, 32, 124, 102, 40, 120, - 41, 45, 102, 40, 121, 41, 124, 32, 60, 32, 949 - ,0]) - @test str1 == str2 - - # Cleanup unicode data - for encoding in ["UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"] - rm(joinpath(unicodedir,encoding*".unicode")) - end - rm(unicodedir) -end - -# check utf8proc handling of CN category constants -let c_ll = 'β', c_cn = '\u038B' - @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL - # check codepoint with category code CN - @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN -end - -# graphemes -let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h", 
- "β","l","a","h", - "b\u0302","l","á","h"]), - ("", UTF8String[]), - ("x\u0302", ["x\u0302"]), - ("\U1d4c1\u0302", ["\U1d4c1\u0302"]), - ("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302", - "\U1d4c1\u0300"]), - ("x",["x"]), - ("abc",["a","b","c"])) - for T in (utf8,utf16,utf32) - for nf in (:NFC, :NFD) - for (s, g) in grphtest - s_ = T(normalize_string(s, nf)) - g_ = map(s -> normalize_string(s, nf), g) - grph = collect(graphemes(s_)) - @test grph == g_ - @test length(graphemes(s_)) == length(grph) - end - S = [T(normalize_string(s)) for (s,g) in grphtest] - G = map(graphemes, S) - @test map(graphemes, sort!(S)) == sort!(G) - end - end -end - -# up-to-date character widths (#3721, #6939) -@test charwidth('\U1f355') == strwidth("\U1f355") == strwidth(utf16("\U1f355")) == strwidth("\U1f355\u0302") == strwidth(utf16("\U1f355\u0302")) == 2 - -# handling of embedded NUL chars (#10958) -@test length("\0w") == length("\0α") == 2 -@test strwidth("\0w") == strwidth("\0α") == 1 -@test normalize_string("\0W", casefold=true) == "\0w" +include("unicode/checkstring.jl") +include("unicode/utf16.jl") +include("unicode/utf32.jl") +include("unicode/utf8proc.jl") \ No newline at end of file diff --git a/test/unicode/checkstring.jl b/test/unicode/checkstring.jl new file mode 100644 index 0000000000000..9777e1e0ceb51 --- /dev/null +++ b/test/unicode/checkstring.jl @@ -0,0 +1,162 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# 11575 +# Test invalid sequences + +byt = 0x0 # Needs to be defined outside the try block! 
+try + # Continuation byte not after lead + for byt in 0x80:0xbf + @test_throws UnicodeError Base.checkstring(UInt8[byt]) + end + + # Test lead bytes + for byt in 0xc0:0xff + # Single lead byte at end of string + @test_throws UnicodeError Base.checkstring(UInt8[byt]) + # Lead followed by non-continuation character < 0x80 + @test_throws UnicodeError Base.checkstring(UInt8[byt,0]) + # Lead followed by non-continuation character > 0xbf + @test_throws UnicodeError Base.checkstring(UInt8[byt,0xc0]) + end + + # Test overlong 2-byte + for byt in 0x81:0xbf + @test_throws UnicodeError Base.checkstring(UInt8[0xc0,byt]) + end + for byt in 0x80:0xbf + @test_throws UnicodeError Base.checkstring(UInt8[0xc1,byt]) + end + + # Test overlong 3-byte + for byt in 0x80:0x9f + @test_throws UnicodeError Base.checkstring(UInt8[0xe0,byt,0x80]) + end + + # Test overlong 4-byte + for byt in 0x80:0x8f + @test_throws UnicodeError Base.checkstring(UInt8[0xef,byt,0x80,0x80]) + end + + # Test 4-byte > 0x10ffff + for byt in 0x90:0xbf + @test_throws UnicodeError Base.checkstring(UInt8[0xf4,byt,0x80,0x80]) + end + for byt in 0xf5:0xf7 + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80]) + end + + # Test 5-byte + for byt in 0xf8:0xfb + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80]) + end + + # Test 6-byte + for byt in 0xfc:0xfd + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80,0x80]) + end + + # Test 7-byte + @test_throws UnicodeError Base.checkstring(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]) + + # Three and above byte sequences + for byt in 0xe0:0xef + # Lead followed by only 1 continuation byte + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80]) + # Lead ended by non-continuation character < 0x80 + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0]) + # Lead ended by non-continuation character > 0xbf + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0xc0]) + end + + # 3-byte encoded surrogate 
character(s) + # Single surrogate + @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80]) + # Not followed by surrogate + @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]) + # Trailing surrogate first + @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]) + # Followed by lead surrogate + @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]) + + # Four byte sequences + for byt in 0xf0:0xf4 + # Lead followed by only 2 continuation bytes + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80]) + # Lead followed by non-continuation character < 0x80 + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0]) + # Lead followed by non-continuation character > 0xbf + @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0xc0]) + end +catch exp; + println("Error testing checkstring: $byt, $exp") + throw(exp) +end + +# Surrogates +@test_throws UnicodeError Base.checkstring(UInt16[0xd800]) +@test_throws UnicodeError Base.checkstring(UInt16[0xdc00]) +@test_throws UnicodeError Base.checkstring(UInt16[0xdc00,0xd800]) + +# Surrogates in UTF-32 +@test_throws UnicodeError Base.checkstring(UInt32[0xd800]) +@test_throws UnicodeError Base.checkstring(UInt32[0xdc00]) +@test_throws UnicodeError Base.checkstring(UInt32[0xdc00,0xd800]) + +# Characters > 0x10ffff +@test_throws UnicodeError Base.checkstring(UInt32[0x110000]) + +# Test valid sequences +for (seq, res) in ( + (UInt8[0x0], (1,0,0,0,0)), # Nul byte, beginning of ASCII range + (UInt8[0x7f], (1,0,0,0,0)), # End of ASCII range + (UInt8[0xc0,0x80], (1,1,0,0,0)), # Long encoded Nul byte (Modified UTF-8, Java) + (UInt8[0xc2,0x80], (1,2,0,0,1)), # \u80, beginning of Latin1 range + (UInt8[0xc3,0xbf], (1,2,0,0,1)), # \uff, end of Latin1 range + (UInt8[0xc4,0x80], (1,4,0,0,1)), # \u100, beginning of non-Latin1 2-byte range + (UInt8[0xdf,0xbf], (1,4,0,0,1)), # \u7ff, end of non-Latin1 2-byte range + 
(UInt8[0xe0,0xa0,0x80], (1,8,0,1,0)), # \u800, beginning of 3-byte range + (UInt8[0xed,0x9f,0xbf], (1,8,0,1,0)), # \ud7ff, end of first part of 3-byte range + (UInt8[0xee,0x80,0x80], (1,8,0,1,0)), # \ue000, beginning of second part of 3-byte range + (UInt8[0xef,0xbf,0xbf], (1,8,0,1,0)), # \uffff, end of 3-byte range + (UInt8[0xf0,0x90,0x80,0x80],(1,16,1,0,0)), # \U10000, beginning of 4-byte range + (UInt8[0xf4,0x8f,0xbf,0xbf],(1,16,1,0,0)), # \U10ffff, end of 4-byte range + (UInt8[0xed,0xa0,0x80,0xed,0xb0,0x80], (1,0x30,1,0,0)), # Overlong \U10000, (CESU-8) + (UInt8[0xed,0xaf,0xbf,0xed,0xbf,0xbf], (1,0x30,1,0,0)), # Overlong \U10ffff, (CESU-8) + (UInt16[0x0000], (1,0,0,0,0)), # Nul byte, beginning of ASCII range + (UInt16[0x007f], (1,0,0,0,0)), # End of ASCII range + (UInt16[0x0080], (1,2,0,0,1)), # Beginning of Latin1 range + (UInt16[0x00ff], (1,2,0,0,1)), # End of Latin1 range + (UInt16[0x0100], (1,4,0,0,1)), # Beginning of non-Latin1 2-byte range + (UInt16[0x07ff], (1,4,0,0,1)), # End of non-Latin1 2-byte range + (UInt16[0x0800], (1,8,0,1,0)), # Beginning of 3-byte range + (UInt16[0xd7ff], (1,8,0,1,0)), # End of first part of 3-byte range + (UInt16[0xe000], (1,8,0,1,0)), # Beginning of second part of 3-byte range + (UInt16[0xffff], (1,8,0,1,0)), # End of 3-byte range + (UInt16[0xd800,0xdc00], (1,16,1,0,0)), # \U10000, beginning of 4-byte range + (UInt16[0xdbff,0xdfff], (1,16,1,0,0)), # \U10ffff, end of 4-byte range + (UInt32[0x0000], (1,0,0,0,0)), # Nul byte, beginning of ASCII range + (UInt32[0x007f], (1,0,0,0,0)), # End of ASCII range + (UInt32[0x0080], (1,2,0,0,1)), # Beginning of Latin1 range + (UInt32[0x00ff], (1,2,0,0,1)), # End of Latin1 range + (UInt32[0x0100], (1,4,0,0,1)), # Beginning of non-Latin1 2-byte range + (UInt32[0x07ff], (1,4,0,0,1)), # End of non-Latin1 2-byte range + (UInt32[0x0800], (1,8,0,1,0)), # Beginning of 3-byte range + (UInt32[0xd7ff], (1,8,0,1,0)), # End of first part of 3-byte range + (UInt32[0xe000], (1,8,0,1,0)), # Beginning of 
second part of 3-byte range + (UInt32[0xffff], (1,8,0,1,0)), # End of 3-byte range + (UInt32[0x10000], (1,16,1,0,0)), # \U10000, beginning of 4-byte range + (UInt32[0x10ffff], (1,16,1,0,0)), # \U10ffff, end of 4-byte range + (UInt32[0xd800,0xdc00], (1,0x30,1,0,0)),# Overlong \U10000, (CESU-8) + (UInt32[0xdbff,0xdfff], (1,0x30,1,0,0)))# Overlong \U10ffff, (CESU-8) + @test Base.checkstring(seq) == res +end + +# Test bounds checking +@test_throws BoundsError Base.checkstring(b"abcdef", -10) +@test_throws BoundsError Base.checkstring(b"abcdef", 0) +@test_throws BoundsError Base.checkstring(b"abcdef", 7) +@test_throws BoundsError Base.checkstring(b"abcdef", 3, -10) +@test_throws BoundsError Base.checkstring(b"abcdef", 3, 0) +@test_throws BoundsError Base.checkstring(b"abcdef", 3, 7) +@test_throws ArgumentError Base.checkstring(b"abcdef", 3, 1) diff --git a/test/unicode/utf16.jl b/test/unicode/utf16.jl new file mode 100644 index 0000000000000..7c5fbac5ae4ff --- /dev/null +++ b/test/unicode/utf16.jl @@ -0,0 +1,14 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# UTF16 +u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" +u16 = utf16(u8) +@test sizeof(u16) == 18 +@test length(u16.data) == 10 && u16.data[end] == 0 +@test length(u16) == 5 +@test utf8(u16) == u8 +@test collect(u8) == collect(u16) +@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18)) +@test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16))) +@test_throws UnicodeError utf16(utf32(Char(0x120000))) +@test_throws UnicodeError utf16(UInt8[1,2,3]) diff --git a/test/unicode/utf32.jl b/test/unicode/utf32.jl new file mode 100644 index 0000000000000..15ddb1da56f74 --- /dev/null +++ b/test/unicode/utf32.jl @@ -0,0 +1,19 @@ +# This file is a part of Julia. 
License is MIT: http://julialang.org/license + +# UTF32 +u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" +u32 = utf32(u8) +@test sizeof(u32) == 20 +@test length(u32.data) == 6 && u32.data[end] == Char(0) +@test length(u32) == 5 +@test utf8(u32) == u8 +@test collect(u8) == collect(u32) +@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array(UInt8, 20), 1, reinterpret(UInt8, u32.data), 1, 20)) +@test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32))) +@test_throws UnicodeError utf32(UInt8[1,2,3]) + +# Wstring +u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" +w = wstring(u8) +@test length(w) == 5 && utf8(w) == u8 && collect(u8) == collect(w) +@test u8 == WString(w.data) diff --git a/test/unicode/utf8proc.jl b/test/unicode/utf8proc.jl new file mode 100644 index 0000000000000..2963393cd1910 --- /dev/null +++ b/test/unicode/utf8proc.jl @@ -0,0 +1,259 @@ +# This file is a part of Julia. License is MIT: http://julialang.org/license + +# normalize_string (Unicode normalization etc.): +@test normalize_string("\u006e\u0303", :NFC) == "\u00f1" +@test "\u006e\u0303" == normalize_string("\u00f1", :NFD) +@test normalize_string("\ufb00", :NFC) != "ff" +@test normalize_string("\ufb00", :NFKC) == "ff" +@test normalize_string("\u006e\u0303\ufb00", :NFKC) == "\u00f1"*"ff" +@test normalize_string("\u00f1\ufb00", :NFKD) == "\u006e\u0303"*"ff" +@test normalize_string("\u006e\u0303", compose=true) == "\u00f1" +@test "\u006e\u0303" == normalize_string("\u00f1", decompose=true) +@test normalize_string("\u006e\u0303\u00b5",compat=true) == "\u00f1\u03bc" +@test normalize_string("Σσς",casefold=true) == "σσσ" +@test normalize_string("∕⁄", lump=true) == "//" +@test normalize_string("\ua\n\r\r\ua", newline2lf=true) == "\ua\ua\ua\ua" +@test normalize_string("\ua\n\r\r\ua", newline2ls=true) == "\u2028\u2028\u2028\u2028" +@test normalize_string("\ua\n\r\r\ua", newline2ps=true) == "\u2029\u2029\u2029\u2029" +@test normalize_string("\u00f1", stripmark=true) == "n" +@test 
isempty(normalize_string("\u00ad", stripignore=true)) +@test normalize_string("\t\r", stripcc=true) == " " +@test normalize_string("\t\r", stripcc=true, newline2ls=true) == " \u2028" + +#Tests from Unicode SA#15, "Unicode normalization forms" +#http://www.unicode.org/reports/tr15/ + +#1. Canonical equivalence +let ==(a::Array{Char},b::Array{Char}) = normalize_string(string(a...), :NFC)==normalize_string(string(b...), :NFC) + ==(a,b) = Base.(:(==))(a,b) + @test ['C', '̧'] == ['Ç'] + @test ['q', '̇', '̣'] == ['q', '̣', '̇'] + @test ['가'] == ['ᄀ', 'ᅡ'] + @test ['Ω'] == ['Ω'] +end + +#2. Compatibility Equivalence +let ==(a::Array{Char},b::Array{Char}) = normalize_string(string(a...), :NFKC)==normalize_string(string(b...), :NFKC) + ==(a,b) = Base.(:(==))(a,b) + @test ['ℌ'] == ['ℍ'] == ['H'] + @test ['ﻨ'] == ['ﻧ'] == ['ﻦ'] == ['ﻥ'] + @test ['①'] == ['1'] + @test ['カ'] == ['カ'] + @test ['︷'] == ['{'] + @test ['⁹'] == ['₉'] + @test ['㌀'] == ['ア', 'パ', 'ー', 'ト'] + @test ['¼'] == ['1', '⁄', '4'] + @test ['dž'] == ['d', 'ž'] +end + +#3. Singletons +@test normalize_string("\U212b", :NFD) == "A\U030a" +@test normalize_string("\U212b", :NFC) == "\U00c5" +@test normalize_string("\U2126", :NFC) == normalize_string("\U2126", :NFD) == "\U03a9" + +#4. Canonical Composites +@test normalize_string("\U00c5", :NFC) == "\U00c5" +@test normalize_string("\U00c5", :NFD) == "A\U030a" +@test normalize_string("\U00f4", :NFC) == "\U00f4" +@test normalize_string("\U00f4", :NFD) == "o\U0302" + +#5. Multiple Combining Marks +@test normalize_string("\U1e69", :NFD) == "s\U0323\U0307" +@test normalize_string("\U1e69", :NFC) == "\U1e69" +@test normalize_string("\U1e0b\U0323", :NFD) == "d\U0323\U0307" +@test normalize_string("\U1e0b\U0323", :NFC) == "\U1e0d\U0307" +@test normalize_string("q\U0307\U0323", :NFC) == "q\U0323\U0307" +@test normalize_string("q\U0307\U0323", :NFD) == "q\U0323\U0307" + +#6. 
Compatibility Composites +@test normalize_string("\Ufb01", :NFD) == normalize_string("\Ufb01", :NFC) == "\Ufb01" +@test normalize_string("\Ufb01", :NFKD) == normalize_string("\Ufb01", :NFKC) == "fi" +@test normalize_string("2\U2075", :NFD) == normalize_string("2\U2075", :NFC) == "2\U2075" +@test normalize_string("2\U2075", :NFKD) == normalize_string("2\U2075", :NFKC) == "25" +@test normalize_string("\U1e9b\U0323", :NFD) == "\U017f\U0323\U0307" +@test normalize_string("\U1e9b\U0323", :NFC) == "\U1e9b\U0323" +@test normalize_string("\U1e9b\U0323", :NFKD) == "s\U0323\U0307" +@test normalize_string("\U1e9b\U0323", :NFKC) == "\U1e69" + +#issue #5939 uft8proc/libmojibake character predicates +let + alower=['a', 'd', 'j', 'y', 'z'] + ulower=['α', 'β', 'γ', 'δ', 'ф', 'я'] + for c in vcat(alower,ulower) + @test islower(c) == true + @test isupper(c) == false + @test isdigit(c) == false + @test isnumber(c) == false + end + + aupper=['A', 'D', 'J', 'Y', 'Z'] + uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Dž', 'Ж', 'Д'] + + for c in vcat(aupper,uupper) + @test islower(c) == false + @test isupper(c) == true + @test isdigit(c) == false + @test isnumber(c) == false + end + + nocase=['א','ﺵ'] + alphas=vcat(alower,ulower,aupper,uupper,nocase) + + for c in alphas + @test isalpha(c) == true + @test isnumber(c) == false + end + + + anumber=['0', '1', '5', '9'] + unumber=['٣', '٥', '٨', '¹', 'ⅳ' ] + + for c in anumber + @test isdigit(c) == true + @test isnumber(c) == true + end + for c in unumber + @test isdigit(c) == false + @test isnumber(c) == true + end + + alnums=vcat(alphas,anumber,unumber) + for c in alnums + @test isalnum(c) == true + @test ispunct(c) == false + end + + asymbol = ['(',')', '~', '$' ] + usymbol = ['∪', '∩', '⊂', '⊃', '√', '€', '¥', '↰', '△', '§'] + + apunct =['.',',',';',':','&'] + upunct =['‡', '؟', '჻' ] + + for c in vcat(apunct,upunct) + @test ispunct(c) == true + @test isalnum(c) == false + end + + for c in vcat(alnums,asymbol,usymbol,apunct,upunct) + @test isprint(c) == 
true + @test isgraph(c) == true + @test isspace(c) == false + @test iscntrl(c) == false + end + + NBSP = Char(0x0000A0) + ENSPACE = Char(0x002002) + EMSPACE = Char(0x002003) + THINSPACE = Char(0x002009) + ZWSPACE = Char(0x002060) + + uspace = [ENSPACE, EMSPACE, THINSPACE] + aspace = [' '] + acntrl_space = ['\t', '\n', '\v', '\f', '\r'] + for c in vcat(aspace,uspace) + @test isspace(c) == true + @test isprint(c) == true + @test isgraph(c) == false + end + + for c in vcat(acntrl_space) + @test isspace(c) == true + @test isprint(c) == false + @test isgraph(c) == false + end + + @test isspace(ZWSPACE) == false # zero-width space + + acontrol = [ Char(0x001c), Char(0x001d), Char(0x001e), Char(0x001f)] + latincontrol = [ Char(0x0080), Char(0x0085) ] + ucontrol = [ Char(0x200E), Char(0x202E) ] + + for c in vcat(acontrol, acntrl_space, latincontrol) + @test iscntrl(c) == true + @test isalnum(c) == false + @test isprint(c) == false + @test isgraph(c) == false + end + + for c in ucontrol #non-latin1 controls + if c!=Char(0x0085) + @test iscntrl(c) == false + @test isspace(c) == false + @test isalnum(c) == false + @test isprint(c) == false + @test isgraph(c) == false + end + end + +end + +@test isspace(" \t \n \r ")==true +@test isgraph(" \t \n \r ")==false +@test isprint(" \t \n \r ")==false +@test isalpha(" \t \n \r ")==false +@test isnumber(" \t \n \r ")==false +@test ispunct(" \t \n \r ")==false + +@test isspace("ΣβΣβ")==false +@test isalpha("ΣβΣβ")==true +@test isgraph("ΣβΣβ")==true +@test isprint("ΣβΣβ")==true +@test isupper("ΣβΣβ")==false +@test islower("ΣβΣβ")==false +@test isnumber("ΣβΣβ")==false +@test iscntrl("ΣβΣβ")==false +@test ispunct("ΣβΣβ")==false + +@test isnumber("23435")==true +@test isdigit("23435")==true +@test isalnum("23435")==true +@test isalpha("23435")==false +@test iscntrl( string(Char(0x0080))) == true +@test ispunct( "‡؟჻") ==true + +@test isxdigit('0') == true +@test isxdigit("0") == true +@test isxdigit("a") == true +@test isxdigit("g") == 
false + +# check utf8proc handling of CN category constants +let c_ll = 'β', c_cn = '\u038B' + @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL + # check codepoint with category code CN + @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN +end + +# graphemes +let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h", + "β","l","a","h", + "b\u0302","l","á","h"]), + ("", UTF8String[]), + ("x\u0302", ["x\u0302"]), + ("\U1d4c1\u0302", ["\U1d4c1\u0302"]), + ("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302", + "\U1d4c1\u0300"]), + ("x",["x"]), + ("abc",["a","b","c"])) + for T in (utf8,utf16,utf32) + for nf in (:NFC, :NFD) + for (s, g) in grphtest + s_ = T(normalize_string(s, nf)) + g_ = map(s -> normalize_string(s, nf), g) + grph = collect(graphemes(s_)) + @test grph == g_ + @test length(graphemes(s_)) == length(grph) + end + S = [T(normalize_string(s)) for (s,g) in grphtest] + G = map(graphemes, S) + @test map(graphemes, sort!(S)) == sort!(G) + end + end +end + +# up-to-date character widths (#3721, #6939) +@test charwidth('\U1f355') == strwidth("\U1f355") == strwidth(utf16("\U1f355")) == strwidth("\U1f355\u0302") == strwidth(utf16("\U1f355\u0302")) == 2 + +# handling of embedded NUL chars (#10958) +@test length("\0w") == length("\0α") == 2 +@test strwidth("\0w") == strwidth("\0α") == 1 +@test normalize_string("\0W", casefold=true) == "\0w"