We are going to reuse the tesseract OCR code.
Create a new tesseract_ocr() helper and use it.
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
tests/acceptance/tesseract_utils.py | 18 ++++++++++++++++++
2 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
index 3c7400c43e4..09e2745cc52 100644
--- a/tests/acceptance/machine_m68k_nextcube.py
+++ b/tests/acceptance/machine_m68k_nextcube.py
@@ -7,13 +7,11 @@
import os
import time
-import logging
from avocado_qemu import Test
from avocado import skipUnless
-from avocado.utils import process
-from tesseract_utils import tesseract_available
+from tesseract_utils import tesseract_available, tesseract_ocr
PIL_AVAILABLE = True
try:
@@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
screenshot_path = os.path.join(self.workdir, "dump.ppm")
self.check_bootrom_framebuffer(screenshot_path)
-
- console_logger = logging.getLogger('console')
- text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
- for line in text.split('\n'):
- if len(line):
- console_logger.debug(line)
+ lines = tesseract_ocr(screenshot_path, tesseract_version=3)
+ text = '\n'.join(lines)
self.assertIn('Backplane', text)
self.assertIn('Ethernet address', text)
@@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
screenshot_path = os.path.join(self.workdir, "dump.ppm")
self.check_bootrom_framebuffer(screenshot_path)
-
- console_logger = logging.getLogger('console')
- proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
- text = proc.stdout_text
- for line in text.split('\n'):
- if len(line):
- console_logger.debug(line)
+ lines = tesseract_ocr(screenshot_path, tesseract_version=4)
+ text = '\n'.join(lines)
self.assertIn('Testing the FPU, SCC', text)
self.assertIn('System test failed. Error code', text)
self.assertIn('Boot command', text)
diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
index acd6e8c2faa..72cd9ab7989 100644
--- a/tests/acceptance/tesseract_utils.py
+++ b/tests/acceptance/tesseract_utils.py
@@ -6,7 +6,9 @@
# later. See the COPYING file in the top-level directory.
import re
+import logging
+from avocado.utils import process
from avocado.utils.path import find_command, CmdNotFoundError
def tesseract_available(expected_version):
@@ -26,3 +28,19 @@ def tesseract_available(expected_version):
return False
# now this is guaranteed to be a digit
return int(match.groups()[0]) == expected_version
+
+
+def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
+ console_logger = logging.getLogger('tesseract')
+ console_logger.debug(image_path)
+ if tesseract_version == 4:
+ tesseract_args += ' --oem 1'
+ proc = process.run("tesseract {} {} stdout".format(tesseract_args,
+ image_path))
+ lines = []
+ for line in proc.stdout_text.split('\n'):
+ sline = line.strip()
+ if len(sline):
+ console_logger.debug(sline)
+ lines += [sline]
+ return lines
--
2.26.2
On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
> We are going to reuse the tesseract OCR code.
> Create a new tesseract_ocr() helper and use it.
>
> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
> ---
> tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
> tests/acceptance/tesseract_utils.py | 18 ++++++++++++++++++
> 2 files changed, 23 insertions(+), 16 deletions(-)
>
> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
> index 3c7400c43e4..09e2745cc52 100644
> --- a/tests/acceptance/machine_m68k_nextcube.py
> +++ b/tests/acceptance/machine_m68k_nextcube.py
> @@ -7,13 +7,11 @@
>
> import os
> import time
> -import logging
>
> from avocado_qemu import Test
> from avocado import skipUnless
> -from avocado.utils import process
>
> -from tesseract_utils import tesseract_available
> +from tesseract_utils import tesseract_available, tesseract_ocr
>
> PIL_AVAILABLE = True
> try:
> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
> def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
> screenshot_path = os.path.join(self.workdir, "dump.ppm")
> self.check_bootrom_framebuffer(screenshot_path)
> -
> - console_logger = logging.getLogger('console')
> - text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
> - for line in text.split('\n'):
> - if len(line):
> - console_logger.debug(line)
> + lines = tesseract_ocr(screenshot_path, tesseract_version=3)
> + text = '\n'.join(lines)
> self.assertIn('Backplane', text)
> self.assertIn('Ethernet address', text)
>
> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
> def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
> screenshot_path = os.path.join(self.workdir, "dump.ppm")
> self.check_bootrom_framebuffer(screenshot_path)
> -
> - console_logger = logging.getLogger('console')
> - proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
> - text = proc.stdout_text
> - for line in text.split('\n'):
> - if len(line):
> - console_logger.debug(line)
> + lines = tesseract_ocr(screenshot_path, tesseract_version=4)
> + text = '\n'.join(lines)
> self.assertIn('Testing the FPU, SCC', text)
> self.assertIn('System test failed. Error code', text)
> self.assertIn('Boot command', text)
> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
> index acd6e8c2faa..72cd9ab7989 100644
> --- a/tests/acceptance/tesseract_utils.py
> +++ b/tests/acceptance/tesseract_utils.py
> @@ -6,7 +6,9 @@
> # later. See the COPYING file in the top-level directory.
>
> import re
> +import logging
>
> +from avocado.utils import process
> from avocado.utils.path import find_command, CmdNotFoundError
>
> def tesseract_available(expected_version):
> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):
> return False
> # now this is guaranteed to be a digit
> return int(match.groups()[0]) == expected_version
> +
> +
> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
> + console_logger = logging.getLogger('tesseract')
> + console_logger.debug(image_path)
> + if tesseract_version == 4:
> + tesseract_args += ' --oem 1'
> + proc = process.run("tesseract {} {} stdout".format(tesseract_args,
> + image_path))
> + lines = []
> + for line in proc.stdout_text.split('\n'):
> + sline = line.strip()
> + if len(sline):
> + console_logger.debug(sline)
> + lines += [sline]
> + return lines
Would it make sense to completely hide the tesseract version handling in
this new tesseract_utils.py file now, so that the tests themselves do not
have to worry about it anymore? That is, would it be possible to merge
test_bootrom_framebuffer_ocr_with_tesseract_v3 and
test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?
Thomas
On 24/10/2020 08.35, Thomas Huth wrote:
> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>> We are going to reuse the tesseract OCR code.
>> Create a new tesseract_ocr() helper and use it.
>>
>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>> ---
>> tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>> tests/acceptance/tesseract_utils.py | 18 ++++++++++++++++++
>> 2 files changed, 23 insertions(+), 16 deletions(-)
>>
>> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
>> index 3c7400c43e4..09e2745cc52 100644
>> --- a/tests/acceptance/machine_m68k_nextcube.py
>> +++ b/tests/acceptance/machine_m68k_nextcube.py
>> @@ -7,13 +7,11 @@
>>
>> import os
>> import time
>> -import logging
>>
>> from avocado_qemu import Test
>> from avocado import skipUnless
>> -from avocado.utils import process
>>
>> -from tesseract_utils import tesseract_available
>> +from tesseract_utils import tesseract_available, tesseract_ocr
>>
>> PIL_AVAILABLE = True
>> try:
>> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
>> def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>> screenshot_path = os.path.join(self.workdir, "dump.ppm")
>> self.check_bootrom_framebuffer(screenshot_path)
>> -
>> - console_logger = logging.getLogger('console')
>> - text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
>> - for line in text.split('\n'):
>> - if len(line):
>> - console_logger.debug(line)
>> + lines = tesseract_ocr(screenshot_path, tesseract_version=3)
>> + text = '\n'.join(lines)
>> self.assertIn('Backplane', text)
>> self.assertIn('Ethernet address', text)
>>
>> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>> def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
>> screenshot_path = os.path.join(self.workdir, "dump.ppm")
>> self.check_bootrom_framebuffer(screenshot_path)
>> -
>> - console_logger = logging.getLogger('console')
>> - proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
>> - text = proc.stdout_text
>> - for line in text.split('\n'):
>> - if len(line):
>> - console_logger.debug(line)
>> + lines = tesseract_ocr(screenshot_path, tesseract_version=4)
>> + text = '\n'.join(lines)
>> self.assertIn('Testing the FPU, SCC', text)
>> self.assertIn('System test failed. Error code', text)
>> self.assertIn('Boot command', text)
>> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
>> index acd6e8c2faa..72cd9ab7989 100644
>> --- a/tests/acceptance/tesseract_utils.py
>> +++ b/tests/acceptance/tesseract_utils.py
>> @@ -6,7 +6,9 @@
>> # later. See the COPYING file in the top-level directory.
>>
>> import re
>> +import logging
>>
>> +from avocado.utils import process
>> from avocado.utils.path import find_command, CmdNotFoundError
>>
>> def tesseract_available(expected_version):
>> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):
>> return False
>> # now this is guaranteed to be a digit
>> return int(match.groups()[0]) == expected_version
>> +
>> +
>> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
>> + console_logger = logging.getLogger('tesseract')
>> + console_logger.debug(image_path)
>> + if tesseract_version == 4:
>> + tesseract_args += ' --oem 1'
>> + proc = process.run("tesseract {} {} stdout".format(tesseract_args,
>> + image_path))
>> + lines = []
>> + for line in proc.stdout_text.split('\n'):
>> + sline = line.strip()
>> + if len(sline):
>> + console_logger.debug(sline)
>> + lines += [sline]
>> + return lines
>
> Would it make sense to completely hide the tesseract version handling in
> this new tesseract_utils.py file now, so that the tests themselves do not
> have to worry about this anymore (i.e. would it be possible to merge
> test_bootrom_framebuffer_ocr_with_tesseract_v3 and
> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)
If I've got that right, there is now also a proper release of Tesseract 4,
so maybe we can simply drop the testing with version 3 now?
Thomas
On 10/24/20 8:40 AM, Thomas Huth wrote:
> On 24/10/2020 08.35, Thomas Huth wrote:
>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>>> We are going to reuse the tesseract OCR code.
>>> Create a new tesseract_ocr() helper and use it.
>>>
>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>>> ---
>>> tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>>> tests/acceptance/tesseract_utils.py | 18 ++++++++++++++++++
>>> 2 files changed, 23 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
>>> index 3c7400c43e4..09e2745cc52 100644
>>> --- a/tests/acceptance/machine_m68k_nextcube.py
>>> +++ b/tests/acceptance/machine_m68k_nextcube.py
>>> @@ -7,13 +7,11 @@
>>>
>>> import os
>>> import time
>>> -import logging
>>>
>>> from avocado_qemu import Test
>>> from avocado import skipUnless
>>> -from avocado.utils import process
>>>
>>> -from tesseract_utils import tesseract_available
>>> +from tesseract_utils import tesseract_available, tesseract_ocr
>>>
>>> PIL_AVAILABLE = True
>>> try:
>>> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):
>>> def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>>> screenshot_path = os.path.join(self.workdir, "dump.ppm")
>>> self.check_bootrom_framebuffer(screenshot_path)
>>> -
>>> - console_logger = logging.getLogger('console')
>>> - text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
>>> - for line in text.split('\n'):
>>> - if len(line):
>>> - console_logger.debug(line)
>>> + lines = tesseract_ocr(screenshot_path, tesseract_version=3)
>>> + text = '\n'.join(lines)
>>> self.assertIn('Backplane', text)
>>> self.assertIn('Ethernet address', text)
>>>
>>> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
>>> def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
>>> screenshot_path = os.path.join(self.workdir, "dump.ppm")
>>> self.check_bootrom_framebuffer(screenshot_path)
>>> -
>>> - console_logger = logging.getLogger('console')
>>> - proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
>>> - text = proc.stdout_text
>>> - for line in text.split('\n'):
>>> - if len(line):
>>> - console_logger.debug(line)
>>> + lines = tesseract_ocr(screenshot_path, tesseract_version=4)
>>> + text = '\n'.join(lines)
>>> self.assertIn('Testing the FPU, SCC', text)
>>> self.assertIn('System test failed. Error code', text)
>>> self.assertIn('Boot command', text)
>>> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
>>> index acd6e8c2faa..72cd9ab7989 100644
>>> --- a/tests/acceptance/tesseract_utils.py
>>> +++ b/tests/acceptance/tesseract_utils.py
>>> @@ -6,7 +6,9 @@
>>> # later. See the COPYING file in the top-level directory.
>>>
>>> import re
>>> +import logging
>>>
>>> +from avocado.utils import process
>>> from avocado.utils.path import find_command, CmdNotFoundError
>>>
>>> def tesseract_available(expected_version):
>>> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):
>>> return False
>>> # now this is guaranteed to be a digit
>>> return int(match.groups()[0]) == expected_version
>>> +
>>> +
>>> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
>>> + console_logger = logging.getLogger('tesseract')
>>> + console_logger.debug(image_path)
>>> + if tesseract_version == 4:
>>> + tesseract_args += ' --oem 1'
>>> + proc = process.run("tesseract {} {} stdout".format(tesseract_args,
>>> + image_path))
>>> + lines = []
>>> + for line in proc.stdout_text.split('\n'):
>>> + sline = line.strip()
>>> + if len(sline):
>>> + console_logger.debug(sline)
>>> + lines += [sline]
>>> + return lines
>>
>> Would it make sense to completely hide the tesseract version handling in
>> this new tesseract_utils.py file now, so that the tests themselves do not
>> have to worry about this anymore
Yes, good idea.
> (i.e. would it be possible to merge
>> test_bootrom_framebuffer_ocr_with_tesseract_v3 and
>> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)
>
> If I've got that right, there is also now a proper release 4 of Tesseract,
> so maybe we can simply scratch the testing with version 3 now?
Good to know, I'll have a look. Thanks!
>
> Thomas
>
On 10/24/20 9:37 AM, Philippe Mathieu-Daudé wrote:
> On 10/24/20 8:40 AM, Thomas Huth wrote:
>> On 24/10/2020 08.35, Thomas Huth wrote:
>>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>>>> We are going to reuse the tesseract OCR code.
>>>> Create a new tesseract_ocr() helper and use it.
>>>>
>>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>>>> ---
>>>> tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>>>> tests/acceptance/tesseract_utils.py | 18 ++++++++++++++++++
>>>> 2 files changed, 23 insertions(+), 16 deletions(-)
...
>>>
>>> Would it make sense to completely hide the tesseract version handling in
>>> this new tesseract_utils.py file now, so that the tests themselves do
>>> not
>>> have to worry about this anymore
The problem is that the recognized strings differ between versions;
see tests/acceptance/machine_m68k_nextcube.py:
lines = tesseract_ocr(screenshot_path, tesseract_version=3)
text = '\n'.join(lines)
self.assertIn('Backplane', text)
self.assertIn('Ethernet address', text)
and:
lines = tesseract_ocr(screenshot_path, tesseract_version=4)
text = '\n'.join(lines)
self.assertIn('Testing the FPU, SCC', text)
self.assertIn('System test failed. Error code', text)
self.assertIn('Boot command', text)
self.assertIn('Next>', text)
>
> Yes, good idea.
>
>> (i.e. would it be possible to merge
>>> test_bootrom_framebuffer_ocr_with_tesseract_v3 and
>>> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test
>>> that way?)
>>
>> If I've got that right, there is also now a proper release 4 of
>> Tesseract,
>> so maybe we can simply scratch the testing with version 3 now?
>
> Good to know, I'll have a look. Thanks!
>
>>
>> Thomas
>>
>
On 24/10/2020 19.40, Philippe Mathieu-Daudé wrote:
> On 10/24/20 9:37 AM, Philippe Mathieu-Daudé wrote:
>> On 10/24/20 8:40 AM, Thomas Huth wrote:
>>> On 24/10/2020 08.35, Thomas Huth wrote:
>>>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
>>>>> We are going to reuse the tesseract OCR code.
>>>>> Create a new tesseract_ocr() helper and use it.
>>>>>
>>>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
>>>>> ---
>>>>> tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
>>>>> tests/acceptance/tesseract_utils.py | 18 ++++++++++++++++++
>>>>> 2 files changed, 23 insertions(+), 16 deletions(-)
> ...
>
>>>>
>>>> Would it make sense to completely hide the tesseract version handling in
>>>> this new tesseract_utils.py file now, so that the tests themselves do not
>>>> have to worry about this anymore
>
> The problem is the recognized strings differ between versions,
> see in tests/acceptance/machine_m68k_nextcube.py:
>
> lines = tesseract_ocr(screenshot_path, tesseract_version=3)
> text = '\n'.join(lines)
> self.assertIn('Backplane', text)
> self.assertIn('Ethernet address', text)
>
> and:
>
> lines = tesseract_ocr(screenshot_path, tesseract_version=4)
> text = '\n'.join(lines)
> self.assertIn('Testing the FPU, SCC', text)
> self.assertIn('System test failed. Error code', text)
> self.assertIn('Boot command', text)
> self.assertIn('Next>', text)
Ah, right, I forgot about that ... well, one more reason to completely
switch to tesseract v4 now ;-)
Thomas
© 2016 - 2026 Red Hat, Inc.