From 9ac1c76ce4cccb1d18035003e900a3574ba897b9 Mon Sep 17 00:00:00 2001 From: jacobwilliams Date: Sun, 3 May 2015 16:38:01 -0500 Subject: [PATCH 1/2] first attempt to merge the "speed" and "unicode" features together. Probably doesn't work yet. --- src/json_module.F90 | 591 ++++++++++++++++++++++++++------------- src/tests/jf_test_11.F90 | 2 + 2 files changed, 393 insertions(+), 200 deletions(-) diff --git a/src/json_module.F90 b/src/json_module.F90 index ee88aa7716..d7c47ba3d1 100644 --- a/src/json_module.F90 +++ b/src/json_module.F90 @@ -130,51 +130,68 @@ module json_module !********************************************************* !********************************************************* - !****M* json_module/STRING_KIND + !****d* json_module/CDK ! ! NAME - ! STRING_KIND + ! CDK ! ! DESCRIPTION - ! String kind preprocessor macro. + ! Processor dependendant 'DEFAULT' character kind. + ! This is 1 byte for the Intel and Gfortran compilers. + ! + ! NOTES + ! CK and CDK are the json-fortran character kind and json-fortran default + ! character kind respectively. Client code must ensure characters of kind=CK + ! are used for all character variables and strings passed to the json-fortran + ! library *EXCEPT* for file names which must be of 'DEFAULT' character kind, + ! provided here as CDK. In particular, any: + ! * file name + ! * format statement + ! * file path + ! passed to the json-fortran library *MUST* be of type CDK. This + ! will be the case for all string literals nor prepended with CK_ and only + ! if ISO 10646 is supported and enabled, will strings of kind CK be different + ! than CDK ! ! SOURCE -# define STRING_KIND 'DEFAULT' - ! this is the string kind to use unless compiling with GFortran AND - ! UCS4/ISO 10646 support is requested -# ifdef __GFORTRAN__ -# ifdef USE_UCS4 - ! gfortran compiler AND UCS4 support requested, & silence redefine warning: -# undef STRING_KIND -# define STRING_KIND 'ISO_10646' -# endif -# endif + integer,parameter,public :: CDK = selected_char_kind('DEFAULT') !********************************************************* !********************************************************* - !****M* json_module/FILE_ENCODING + !****d* json_module/LK ! ! NAME - ! FILE_ENCODING + ! LK ! ! DESCRIPTION - ! File encoding preprocessor macro. + ! Default logical kind. + ! This is 4 bytes for the Intel and Gfortran compilers + ! (and perhaps others). + ! The declaration ensures a valid kind + ! if the compiler doesn't have a logical_kinds(3). ! ! SOURCE -# define FILE_ENCODING - ! don't ask for utf-8 file encoding unless using UCS4 - ! this may let us use unformatted stream io to read in files more quickly - ! even with unicode support turned on `inquire( ... encoding=FL_ENCODING)` - ! may be able to detect json files in which each character is exactly one - ! byte -# ifdef __GFORTRAN__ -# ifdef USE_UCS4 - ! gfortran compiler AND UCS4 support requested, & silence redefine warning: - ! Make sure we output files with utf-8 encoding too -# undef FILE_ENCODING -# define FILE_ENCODING ,encoding='utf-8' -# endif -# endif + integer,parameter :: LK = logical_kinds(min(3,size(logical_kinds))) + !********************************************************* + + !********************************************************* + !****M* json_module/json_fortran_string_kind + ! + ! NAME + ! json_fortran_string_kind + ! + ! DESCRIPTION + ! String kind preprocessor macro. + ! + ! SOURCE +#if defined __GFORTRAN__ && defined USE_UCS4 + ! gfortran compiler AND UCS4 support requested: + character(kind=CDK,len=*),parameter :: json_fortran_string_kind = 'ISO_10646' +#else + ! this is the string kind to use unless compiling with GFortran AND + ! UCS4/ISO 10646 support is requested + character(kind=CDK,len=*),parameter :: json_fortran_string_kind = 'DEFAULT' +#endif !********************************************************* !********************************************************* @@ -206,52 +223,31 @@ module json_module ! STRING_KIND ! ! SOURCE - integer,parameter,public :: CK = selected_char_kind( STRING_KIND ) + integer,parameter,public :: CK = selected_char_kind(json_fortran_string_kind) !********************************************************* !********************************************************* - !****d* json_module/CDK - ! - ! NAME - ! CDK - ! - ! DESCRIPTION - ! Processor dependendant 'DEFAULT' character kind. - ! This is 1 byte for the Intel and Gfortran compilers. - ! - ! NOTES - ! CK and CDK are the json-fortran character kind and json-fortran default - ! character kind respectively. Client code must ensure characters of kind=CK - ! are used for all character variables and strings passed to the json-fortran - ! library *EXCEPT* for file names which must be of 'DEFAULT' character kind, - ! provided here as CDK. In particular, any: - ! * file name - ! * format statement - ! * file path - ! passed to the json-fortran library *MUST* be of type CDK. This - ! will be the case for all string literals nor prepended with CK_ and only - ! if ISO 10646 is supported and enabled, will strings of kind CK be different - ! than CDK - ! - ! SOURCE - integer,parameter,public :: CDK = selected_char_kind('DEFAULT') - !********************************************************* - - !********************************************************* - !****d* json_module/LK + !****M* json_module/FILE_ENCODING ! ! NAME - ! LK + ! FILE_ENCODING ! ! DESCRIPTION - ! Default logical kind. - ! This is 4 bytes for the Intel and Gfortran compilers - ! (and perhaps others). - ! The declaration ensures a valid kind - ! if the compiler doesn't have a logical_kinds(3). + ! File encoding preprocessor macro. ! ! SOURCE - integer,parameter :: LK = logical_kinds(min(3,size(logical_kinds))) +#if defined __GFORTRAN__ && defined USE_UCS4 + ! gfortran compiler AND UCS4 support requested, & silence redefine warning: + ! Make sure we output files with utf-8 encoding too +#define FILE_ENCODING ,encoding='UTF-8' +#else + ! don't ask for utf-8 file encoding unless using UCS4 + ! this may let us use unformatted stream io to read in files more quickly + ! even with unicode support turned on `inquire( ... encoding=FL_ENCODING)` + ! may be able to detect json files in which each character is exactly one + ! byte +#define FILE_ENCODING +#endif !********************************************************* !********************************************************* @@ -296,6 +292,30 @@ module json_module # endif !********************************************************* + !********************************************************* + !****d* json_module/use_unformatted_stream + ! + ! NAME + ! use_unformatted_stream + ! + ! DESCRIPTION + ! If using GFortran and Unicode is enabled, then + ! JSON files are opened using access='STREAM' and + ! form='UNFORMATTED'. This allows the file to + ! be read faster. + ! + ! SOURCE +#if defined __GFORTRAN__ && defined USE_UCS4 + logical,parameter :: use_unformatted_stream = .false. + character(kind=CDK,len=*),parameter :: access_spec = 'SEQUENTIAL' + character(kind=CDK,len=*),parameter :: form_spec = 'FORMATTED' +#else + logical,parameter :: use_unformatted_stream = .true. + character(kind=CDK,len=*),parameter :: access_spec = 'STREAM' + character(kind=CDK,len=*),parameter :: form_spec = 'UNFORMATTED' +#endif + !********************************************************* + !JSON file extension character(kind=CDK,len=*),parameter,public :: json_ext = '.json' !JSON file extension @@ -341,6 +361,10 @@ module json_module character(kind=CDK,len=*),parameter :: int_fmt = '(I0)' !minimum width format for integers character(kind=CK, len=*),parameter :: star = '*' !for invalid numbers + !for allocatable strings: + integer(IK),parameter :: chunk_size = 100 !allocate chunks of this size + integer(IK) :: ipos = 1 !next character to read + !********************************************************* !****d* json_module/var_type ! @@ -377,7 +401,7 @@ module json_module ! EXAMPLE ! Consider the following example: ! type(json_value),pointer :: p - ! call json_create_object(p) + ! call json_create_object(p,'') !root ! call json_add(p,'year',1805) ! call json_add(p,'value',1.0d0) ! call json_print(p,'test.json') @@ -2096,6 +2120,7 @@ subroutine json_initialize(verbose,compact_reals) pushed_char = '' char_count = 0 line_count = 1 + ipos = 1 end subroutine json_initialize !***************************************************************************************** @@ -3736,34 +3761,74 @@ subroutine escape_string(str_in, str_out) character(kind=CK,len=*),intent(in) :: str_in character(kind=CK,len=:),allocatable,intent(out) :: str_out - integer(IK) :: i + integer(IK) :: i,ipos character(kind=CK,len=1) :: c - str_out = '' - - !go through the string and look for special characters: - do i=1,len(str_in) - - c = str_in(i:i) !get next character in the input string - - select case(c) - case(quotation_mark,backslash,slash) - str_out = str_out//backslash//c - case(bspace) - str_out = str_out//'\b' - case(formfeed) - str_out = str_out//'\f' - case(newline) - str_out = str_out//'\n' - case(carriage_return) - str_out = str_out//'\r' - case(horizontal_tab) - str_out = str_out//'\t' - case default - str_out = str_out//c - end select + character(kind=CK,len=*),parameter :: specials = quotation_mark//& + backslash//& + slash//& + bspace//& + formfeed//& + newline//& + carriage_return//& + horizontal_tab + + !Do a quick scan for the special characters, + ! if any are present, then process the string, + ! otherwise, return the string as is. + if (scan(str_in,specials)>0) then + + str_out = repeat(space,chunk_size) + ipos = 1 + + !go through the string and look for special characters: + do i=1,len(str_in) + + c = str_in(i:i) !get next character in the input string + + !if the string is not big enough, then add another chunk: + if (ipos+3>len(str_out)) str_out = str_out // repeat(space, chunk_size) + + select case(c) + case(quotation_mark,backslash,slash) + str_out(ipos:ipos+1) = backslash//c + ipos = ipos + 2 + case(bspace) + str_out(ipos:ipos+1) = '\b' + ipos = ipos + 2 + case(formfeed) + str_out(ipos:ipos+1) = '\f' + ipos = ipos + 2 + case(newline) + str_out(ipos:ipos+1) = '\n' + ipos = ipos + 2 + case(carriage_return) + str_out(ipos:ipos+1) = '\r' + ipos = ipos + 2 + case(horizontal_tab) + str_out(ipos:ipos+1) = '\t' + ipos = ipos + 2 + case default + str_out(ipos:ipos) = c + ipos = ipos + 1 + end select - end do + end do + + !trim the string if necessary: + if (iposlen(string)) string = string // repeat(space, chunk_size) + !append to string: - string = string//c + string(ip:ip) = c + ip = ip + 1 !hex validation: if (is_hex) then !accumulate the four characters after '\u' @@ -7378,8 +7502,8 @@ subroutine parse_string(unit, str, string) hex = '' is_hex = .false. else - call throw_exception('Error in parse_string: '//& - 'invalid hex string: \u'//trim(hex)) + call throw_exception('Error in parse_string:'//& + ' invalid hex string: \u'//trim(hex)) exit end if end if @@ -7406,9 +7530,44 @@ subroutine parse_string(unit, str, string) end if + !trim the string if necessary: + if (iplen(str)) str = str // repeat(space, chunk_size) + else + str = repeat(space, chunk_size) + ip = 1 + end if + + !append to string: + str(ip:ip) = c + ip = ip + 1 + + end subroutine add_character_to_string +!***************************************************************************************** + !***************************************************************************************** !****if* json_module/parse_for_chars ! @@ -7424,9 +7583,9 @@ subroutine parse_for_chars(unit, str, chars) implicit none - integer(IK), intent(in) :: unit - character(kind=CK,len=:),allocatable,intent(inout) :: str - character(kind=CK,len = *), intent(in) :: chars + integer(IK), intent(in) :: unit + character(kind=CK,len=*),intent(in) :: str + character(kind=CK,len=*), intent(in) :: chars integer(IK) :: i, length logical(LK) :: eof @@ -7439,12 +7598,12 @@ subroutine parse_for_chars(unit, str, chars) do i = 1, length c = pop_char(unit, str=str, eof = eof, skip_ws = .true.) if (eof) then - call throw_exception('Error in parse_for_chars: '//& - 'Unexpected end of file while parsing array.') + call throw_exception('Error in parse_for_chars:'//& + ' Unexpected end of file while parsing array.') return else if (c /= chars(i:i)) then - call throw_exception('Error in parse_for_chars: '//& - 'Unexpected character.: "'//c//'" '//chars(i:i)) + call throw_exception('Error in parse_for_chars:'//& + ' Unexpected character.: "'//c//'" '//chars(i:i)) return end if end do @@ -7476,9 +7635,9 @@ subroutine parse_number(unit, str, value) implicit none - integer(IK),intent(in) :: unit - character(kind=CK,len=:),allocatable,intent(inout) :: str - type(json_value),pointer :: value + integer(IK),intent(in) :: unit + character(kind=CK,len=*),intent(in) :: str + type(json_value),pointer :: value character(kind=CK,len=:),allocatable :: tmp character(kind=CK,len=1) :: c @@ -7488,9 +7647,13 @@ subroutine parse_number(unit, str, value) logical(LK) :: first logical(LK) :: is_integer + !to speed up by reducing the number of character string reallocations: + integer(IK) :: ip !index to put next character + if (.not. exception_thrown) then - tmp = '' + tmp = repeat(space, chunk_size) + ip = 1 first = .true. is_integer = .true. !assume it may be an integer, unless otherwise determined @@ -7512,19 +7675,28 @@ subroutine parse_number(unit, str, value) if (is_integer .and. (.not. first)) is_integer = .false. !add it to the string: - tmp = tmp // c + !tmp = tmp // c !...original + if (ip>len(tmp)) tmp = tmp // repeat(space, chunk_size) + tmp(ip:ip) = c + ip = ip + 1 case(CK_'.',CK_'E',CK_'e') !can be present in real numbers if (is_integer) is_integer = .false. !add it to the string: - tmp = tmp // c + !tmp = tmp // c !...original + if (ip>len(tmp)) tmp = tmp // repeat(space, chunk_size) + tmp(ip:ip) = c + ip = ip + 1 case(CK_'0':CK_'9') !valid characters for numbers !add it to the string: - tmp = tmp // c + !tmp = tmp // c !...original + if (ip>len(tmp)) tmp = tmp // repeat(space, chunk_size) + tmp(ip:ip) = c + ip = ip + 1 case default @@ -7579,22 +7751,20 @@ recursive function pop_char(unit, str, eof, skip_ws) result(popped) implicit none - character(kind=CK,len=1) :: popped - integer(IK),intent(in) :: unit - character(kind=CK,len=:),allocatable,intent(inout) :: str !only used if unit=0 - logical(LK),intent(out) :: eof - logical(LK),intent(in),optional :: skip_ws + character(kind=CK,len=1) :: popped + integer(IK),intent(in) :: unit + character(kind=CK,len=*),intent(in) :: str !only used if unit=0 + logical(LK),intent(out) :: eof + logical(LK),intent(in),optional :: skip_ws - integer(IK) :: ios + integer(IK) :: ios,str_len character(kind=CK,len=1) :: c logical(LK) :: ignore - integer(IK) :: str_len - character(kind=CK,len=:),allocatable :: tmp !workaround for bug in gfortran 4.9.2 compiler if (.not. exception_thrown) then eof = .false. - if (.not.present(skip_ws)) then + if (.not. present(skip_ws)) then ignore = .false. else ignore = skip_ws @@ -7605,44 +7775,52 @@ recursive function pop_char(unit, str, eof, skip_ws) result(popped) if (pushed_index > 0) then ! there is a character pushed back on, most likely from the number parsing + ! NOTE: this can only occur if reading from a file when use_unformatted_stream=.false. c = pushed_char(pushed_index:pushed_index) pushed_index = pushed_index - 1 else if (unit/=0) then !read from the file - read (unit = unit, fmt = '(A1)', advance = 'NO', iostat = ios) c + + !read the next character: + if (use_unformatted_stream) then + read(unit=unit,pos=ipos,iostat=ios) c + else + read(unit=unit,fmt='(A1)',advance='NO',iostat=ios) c + end if + ipos = ipos + 1 + + !....note: maybe try read the file in chunks... + !.... or use asynchronous read with double buffering + ! (see Modern Fortran: Style and Usage) + else !read from the string - tmp = str !!! copy to a temp variable to workaround a bug in gfortran 4.9.2 - str_len = len(tmp) !length of the string - if (str_len>0) then - c = tmp(1:1) - if (str_len>1) then - tmp = tmp(2:str_len) !remove the character that was read - else - tmp = '' !that was the last one - end if - str = tmp - deallocate(tmp) !!! + + str_len = len(str) !length of the string + if (ipos<=str_len) then + c = str(ipos:ipos) ios = 0 else ios = IOSTAT_END !end of the string end if + ipos = ipos + 1 + end if char_count = char_count + 1 !character count in the current line - if (IS_IOSTAT_EOR(ios) .or. c==newline) then !end of record + if (IS_IOSTAT_END(ios)) then !end of file char_count = 0 - line_count = line_count + 1 - cycle + eof = .true. + exit - else if (IS_IOSTAT_END(ios)) then !end of file + elseif (IS_IOSTAT_EOR(ios) .or. c==newline) then !end of record char_count = 0 - eof = .true. - exit + line_count = line_count + 1 + cycle end if @@ -7683,26 +7861,39 @@ end function pop_char ! SEE ALSO ! pop_char ! +! HISTORY +! Jacob Williams : 5/3/2015 : replaced original version of this routine. +! ! SOURCE subroutine push_char(c) implicit none - character(kind=CK,len=1), intent(in) :: c + character(kind=CK,len=1),intent(in) :: c character(kind=CK,len=max_numeric_str_len) :: istr if (.not. exception_thrown) then - pushed_index = pushed_index + 1 + if (use_unformatted_stream) then + + !in this case, c is ignored, and we just + !decrement the stream position counter: + ipos = ipos - 1 - if (pushed_index>0 .and. pushed_index<=len(pushed_char)) then - pushed_char(pushed_index:pushed_index) = c else - call integer_to_string(pushed_index,istr) - call throw_exception('Error in push_char: '//& - 'invalid valid of pushed_index: '//trim(istr)) + + pushed_index = pushed_index + 1 + + if (pushed_index>0 .and. pushed_index<=len(pushed_char)) then + pushed_char(pushed_index:pushed_index) = c + else + call integer_to_string(pushed_index,istr) + call throw_exception('Error in push_char: '//& + 'invalid valid of pushed_index: '//trim(istr)) + end if + end if end if diff --git a/src/tests/jf_test_11.F90 b/src/tests/jf_test_11.F90 index 7a95a3bde7..2bf78521fb 100644 --- a/src/tests/jf_test_11.F90 +++ b/src/tests/jf_test_11.F90 @@ -70,7 +70,9 @@ subroutine test_11(error_cnt) integer,intent(out) :: error_cnt character(kind=CK,len=:),allocatable :: cval type(json_file) :: json !the JSON structure read from the file: +# ifdef USE_UCS4 type(json_file) :: clone +# endif error_cnt = 0 call json_initialize() From 432af672dfc46342c60ab4c754216dc823679a89 Mon Sep 17 00:00:00 2001 From: jacobwilliams Date: Sun, 3 May 2015 20:08:00 -0500 Subject: [PATCH 2/2] added an additional speed test, where file is read into a string first. --- src/tests/jf_test_9.f90 | 75 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/src/tests/jf_test_9.f90 b/src/tests/jf_test_9.f90 index 42d277bc91..fb0e857b68 100644 --- a/src/tests/jf_test_9.f90 +++ b/src/tests/jf_test_9.f90 @@ -48,6 +48,7 @@ module jf_test_9_mod use, intrinsic :: iso_fortran_env , only: error_unit, output_unit, wp => real64 implicit none + !small file - 0.0 sec : http://www.json-generator.com !character(len=*),parameter :: filename = 'random1.json' @@ -57,9 +58,13 @@ module jf_test_9_mod !13 MB - 7.6 sec : http://mtgjson.com !character(len=*),parameter :: filename = 'AllSets.json' + !....WARNING: this file is causing some error.... (bug in code?) !100 MB - takes forever... : https://github.com/seductiveapps/largeJSON !character(len=*),parameter :: filename = '100mb.json' + !small file that contains unicode characters: + !character(len=*),parameter :: filename = 'hello-world-ucs4.json' !!!! test !!!! + character(len=*),parameter :: dir = '../files/inputs/' !working directory contains @@ -74,6 +79,7 @@ subroutine test_9(error_cnt) type(json_file) :: f real :: tstart, tend + character(len=:),allocatable :: str error_cnt = 0 call json_initialize() @@ -84,7 +90,7 @@ subroutine test_9(error_cnt) write(error_unit,'(A)') '' write(error_unit,'(A)') '=================================' - write(error_unit,'(A)') ' EXAMPLE 9 ' + write(error_unit,'(A)') ' EXAMPLE 9a ' write(error_unit,'(A)') '=================================' write(error_unit,'(A)') '' @@ -108,8 +114,75 @@ subroutine test_9(error_cnt) !cleanup: call f%destroy() + write(error_unit,'(A)') '' + write(error_unit,'(A)') '=================================' + write(error_unit,'(A)') ' EXAMPLE 9b ' + write(error_unit,'(A)') '=================================' + + write(error_unit,'(A)') '' + write(error_unit,'(A)') ' Load a file using json_file%load_from_string' + write(error_unit,'(A)') '' + write(error_unit,'(A)') 'Loading file: '//trim(filename) + + call cpu_time(tstart) + call read_file(dir//filename, str) + + if (allocated(str)) then + call f%load_from_string(str) + call cpu_time(tend) + write(error_unit,'(A,1X,F10.3,1X,A)') 'Elapsed time to parse: ',tend-tstart,' sec' + if (json_failed()) then + call json_print_error_message(error_unit) + error_cnt = error_cnt + 1 + else + write(error_unit,'(A)') 'File successfully read' + end if + write(error_unit,'(A)') '' + !write(error_unit,'(A)') str !!!! test !!!! + !write(error_unit,'(A)') '' !!!! test !!!! + else + write(error_unit,'(A)') 'Error loading file' + end if + + !cleanup: + call f%destroy() + end subroutine test_9 + subroutine read_file(filename,str) + ! + ! Reads the contents of the file into the allocatable string str. + ! If there are any problems, str will be returned unallocated. + ! + + ! Will this routine work if the file contains unicode characters?? + + implicit none + + character(len=*),intent(in) :: filename + character(len=:),allocatable,intent(out) :: str + + integer :: iunit,istat,filesize + + open( newunit = iunit,& + file = filename,& + status = 'OLD',& + form = 'UNFORMATTED',& + access = 'STREAM',& + iostat = istat ) + + if (istat==0) then + inquire(file=filename, size=filesize) + if (filesize>0) then + allocate( character(len=filesize) :: str ) + read(iunit,pos=1,iostat=istat) str + if (istat/=0) deallocate(str) + close(iunit, iostat=istat) + end if + end if + + end subroutine read_file + end module jf_test_9_mod program jf_test_9