2025
Dopke, Luan; Accorsi, Arthur; Aires, João; Guder, Larissa; Manssour, Isabel; Griebler, Dalvan: SpeechVis: Simplifying Speech Emotion Visualization. Inproceedings. In: Proceedings of the 31st Brazilian Symposium on Multimedia and the Web, pp. 428-436, SBC, Rio de Janeiro, Brazil, 2025.
@inproceedings{DOPKE:WebMedia:25,
title = {SpeechVis: Simplifying Speech Emotion Visualization},
author = {Luan Dopke and Arthur Accorsi and João Aires and Larissa Guder and Isabel Manssour and Dalvan Griebler},
url = {https://doi.org/10.5753/webmedia.2025.16115},
doi = {10.5753/webmedia.2025.16115},
year = {2025},
date = {2025-11-01},
booktitle = {Proceedings of the 31st Brazilian Symposium on Multimedia and the Web},
pages = {428-436},
address = {Rio de Janeiro, Brazil},
organization = {SBC},
abstract = {As the amount of online content increases, analyzing and following discussions becomes harder. Obtaining relevant information, such as the main discussion topics and the emotions expressed in audio, e.g., in a podcast, requires people to watch or listen to the entire content to understand the context. However, this can take a long time, and people's interpretations of emotions can bias their understanding of the content. A visual summarization of such information can help people quickly understand the audio context and analyze the content regarding the speakers, their emotions, and the main topics covered. In this work, we introduce SpeechVis, a visual analytics tool that visually summarizes speech emotions from an audio source. SpeechVis extracts multiple types of information from the audio, such as the transcription, speakers, main topics, and emotions, to provide visualizations and statistics about the discussed topics and each speaker's emotions. We used multiple off-the-shelf machine learning models to extract audio information and developed several visual representations that aim to facilitate audio analysis. To evaluate SpeechVis, we selected two use cases and performed an analysis to demonstrate how the SpeechVis visualizations can provide valuable insights and facilitate audio interpretation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Guder, Larissa; Dopke, Luan; Kaiser, Marcos; Griebler, Dalvan; Meneguzzi, Felipe: BAH: Beyond Acoustic Handcrafted features for speech emotion recognition in Portuguese. Inproceedings. In: Proceedings of the 31st Brazilian Symposium on Multimedia and the Web, pp. 86-93, SBC, Rio de Janeiro, Brazil, 2025.
@inproceedings{GUDER:WebMedia:25,
title = {BAH: Beyond Acoustic Handcrafted features for speech emotion recognition in Portuguese},
author = {Larissa Guder and Luan Dopke and Marcos Kaiser and Dalvan Griebler and Felipe Meneguzzi},
url = {https://doi.org/10.5753/webmedia.2025.16129},
doi = {10.5753/webmedia.2025.16129},
year = {2025},
date = {2025-11-01},
booktitle = {Proceedings of the 31st Brazilian Symposium on Multimedia and the Web},
pages = {86-93},
address = {Rio de Janeiro, Brazil},
organization = {SBC},
abstract = {Affective computing integrates human feelings with computing applications. One affective computing task is Speech Emotion Recognition (SER), which identifies emotions from spoken audio. Even though emotion is a universal aspect of the human experience, each culture and language has different ways to express and understand emotions, so SER models are commonly designed for a single language. In this work, we explore VERBO, a Brazilian Portuguese dataset for categorical emotion recognition. Our main objective is to determine the best way to extract acoustic features to train a classifier for SER. We compare 18 different methods for generating audio representations, grouped into handcrafted features and audio embeddings. The best representation for VERBO is TRILL embeddings: combined with an SVM classifier, it achieved 92% accuracy on the dataset. To the best of our knowledge, this is the state of the art for this dataset.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
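As a flavor of the simpler family of representations the paper compares, the sketch below computes two classic handcrafted acoustic features, RMS energy and zero-crossing rate, over a single frame. This is an illustrative C++ example with made-up sample values, not code from the paper, which evaluates 18 representation methods including learned embeddings such as TRILL.

// Illustrative sketch: two classic handcrafted acoustic features (RMS energy
// and zero-crossing rate). Hypothetical frame values for demonstration only.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> frame = {0.1f, -0.2f, 0.3f, -0.1f, 0.05f, -0.4f};
    double energy = 0.0;
    int crossings = 0;
    for (size_t i = 0; i < frame.size(); ++i) {
        energy += frame[i] * frame[i];
        if (i > 0 && (frame[i - 1] >= 0) != (frame[i] >= 0))
            ++crossings;  // sign change between consecutive samples
    }
    double rms = std::sqrt(energy / frame.size());
    double zcr = static_cast<double>(crossings) / (frame.size() - 1);
    printf("RMS energy = %.3f, zero-crossing rate = %.3f\n", rms, zcr);
    return 0;
}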
Ahmad, Sunna Imtiaz; Olczyk, Jakub; Araújo, Adriel S.; de Moura Medeiros, João Pedro; Teixeira, Vinicius C.; Gomes, Carlos F. A.; Magnaguagno, Maurício Cecílio; Roederer, Quinn; Dutra, Vinicius; Conley, R. Scott; Griebler, Dalvan; Eckert, George; Pinho, Márcio Sarroglia; Turkkahraman, Hakan: A Novel Multimodal Deep Image Analysis Model for Predicting Extraction/Non-Extraction Decision. Journal Article. In: Orthodontics & Craniofacial Research, vol. na, pp. na, 2025.
@article{AHMAD:OCR:25,
title = {A Novel Multimodal Deep Image Analysis Model for Predicting Extraction/Non-Extraction Decision},
author = {Sunna Imtiaz Ahmad and Jakub Olczyk and Adriel S. Araújo and João Pedro de Moura Medeiros and Vinicius C. Teixeira and Carlos F. A. Gomes and Maurício Cecílio Magnaguagno and Quinn Roederer and Vinicius Dutra and R. Scott Conley and Dalvan Griebler and George Eckert and Márcio Sarroglia Pinho and Hakan Turkkahraman},
url = {https://doi.org/10.1111/ocr.70057},
doi = {10.1111/ocr.70057},
year = {2025},
date = {2025-10-01},
urldate = {2025-10-01},
journal = {Orthodontics & Craniofacial Research},
volume = {na},
pages = {na},
publisher = {Wiley},
abstract = {This study aimed to develop a deep learning classifier capable of predicting the binary extraction/non-extraction decision from lateral cephalometric radiographs (LCRs) and intraoral scans (IOS), to serve as an additional decision-support tool for orthodontists. Materials and Methods: The dataset comprised LCRs and IOS from 617 patients (mean age: 18.2 years; 63.5% female) treated at the Indiana University School of Dentistry. Subjects were categorised into two groups: extraction (192) and non-extraction (425). Two sets of features were extracted from the IOS: traditional arch measurements and novel tooth spatial features. For the LCRs, features were derived using CephNet-based landmark detection (Land) and a convolutional autoencoder (AE), with dimensionality reduced using Principal Component Analysis (PCA). Models were evaluated using accuracy, sensitivity, specificity, positive predictive value (PPV or precision), negative predictive value (NPV), positive likelihood ratio (LR+), negative likelihood ratio (LR−), and F1 score. Results: The IOS + Land model achieved the highest overall accuracy (77%) and F1 score (0.62), with strong specificity (83%) and PPV (62%). In contrast, the Land model yielded the highest sensitivity (82%), but at the cost of lower specificity (57%). McNemar's test revealed that the AE model was significantly less accurate than IOS + AE (p = 0.048), IOS + Land (p = 0.006), and IOS + AE + Land (p = 0.005). Conclusion: Deep learning models can predict the extraction/non-extraction decision from IOS and LCRs with high accuracy and diagnostic performance. Multimodal approaches, particularly those integrating IOS with cephalometric landmarks, demonstrate superior accuracy, sensitivity, and specificity compared to single-modality models.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
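The diagnostic metrics the study reports all derive from the binary confusion matrix (with extraction as the positive class). The following minimal C++ sketch shows the standard definitions; the counts are hypothetical and this is not the study's code.

// Illustrative sketch: the diagnostic metrics reported in the study, computed
// from a binary confusion matrix. Counts below are hypothetical examples.
#include <cstdio>

int main() {
    double tp = 40, fn = 10, fp = 15, tn = 85;     // hypothetical counts
    double sens = tp / (tp + fn);                  // sensitivity (recall)
    double spec = tn / (tn + fp);                  // specificity
    double ppv  = tp / (tp + fp);                  // positive predictive value
    double npv  = tn / (tn + fn);                  // negative predictive value
    double lrp  = sens / (1.0 - spec);             // positive likelihood ratio
    double lrn  = (1.0 - sens) / spec;             // negative likelihood ratio
    double acc  = (tp + tn) / (tp + tn + fp + fn); // accuracy
    double f1   = 2.0 * ppv * sens / (ppv + sens); // F1 score
    printf("acc=%.2f sens=%.2f spec=%.2f ppv=%.2f npv=%.2f "
           "LR+=%.2f LR-=%.2f F1=%.2f\n",
           acc, sens, spec, ppv, npv, lrp, lrn, f1);
    return 0;
}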
Araujo, Gabriell; Griebler, Dalvan; Fernandes, Luiz Gustavo: Performance, Portability, and Productivity of HIP on GPUs with NAS Parallel Benchmarks. Inproceedings. In: 2025 IEEE/SBC 37th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD), pp. 204-214, IEEE, Bonito, Brazil, 2025.
@inproceedings{ARAUJO:SBAC-PAD:25,
title = {Performance, Portability, and Productivity of HIP on GPUs with NAS Parallel Benchmarks},
author = {Gabriell Araujo and Dalvan Griebler and Luiz Gustavo Fernandes},
url = {https://doi.org/10.1109/SBAC-PAD66369.2025.00027},
doi = {10.1109/SBAC-PAD66369.2025.00027},
year = {2025},
date = {2025-10-01},
booktitle = {2025 IEEE/SBC 37th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)},
pages = {204-214},
publisher = {IEEE},
address = {Bonito, Brazil},
series = {SBAC-PAD'25},
abstract = {Graphics Processing Units (GPUs) are powerful, massively parallel processors that have become ubiquitous in modern computing. In recent years, the GPU market has diversified, with vendors like AMD and Intel offering high-performance alternatives to NVIDIA. However, most applications are written using NVIDIA's CUDA API, which is incompatible with non-NVIDIA GPUs, creating significant challenges for developers who must port their code to different architectures. To address this issue, AMD developed the Heterogeneous-Compute Interface for Portability (HIP), an open-source API for cross-vendor GPU programming. However, HIP is relatively new, leaving gaps in the literature regarding its performance, portability, and productivity. In this paper, we evaluate HIP using the NAS Parallel Benchmarks (NPB), a CFD-based suite maintained by NASA. We present the first HIP-based implementation of NPB and conduct experiments on integrated and discrete GPUs from NVIDIA, AMD, and Intel. Our results provide novel insights into HIP’s performance and portability, particularly for integrated GPUs and Intel discrete GPUs, which have been underrepresented in prior studies. We also assess productivity using different metrics to quantify the programming effort of HIP-based implementations. This work addresses key gaps in the literature, offering valuable data and insights for developers targeting emerging GPU architectures.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
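For readers unfamiliar with HIP, its programming model closely mirrors CUDA's, which is what makes single-source cross-vendor code possible. Below is a minimal vector-addition sketch in HIP, an illustrative example rather than code from the NPB port; it compiles with hipcc and runs on AMD (ROCm) or NVIDIA (CUDA) backends.

// Illustrative sketch only: a minimal HIP vector-addition kernel, not part of
// the paper's NPB implementation. Build with: hipcc vadd.cpp -o vadd
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

__global__ void vadd(const float* a, const float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int n = 1 << 20;
    std::vector<float> ha(n, 1.0f), hb(n, 2.0f), hc(n);
    float *da, *db, *dc;
    hipMalloc(&da, n * sizeof(float));
    hipMalloc(&db, n * sizeof(float));
    hipMalloc(&dc, n * sizeof(float));
    hipMemcpy(da, ha.data(), n * sizeof(float), hipMemcpyHostToDevice);
    hipMemcpy(db, hb.data(), n * sizeof(float), hipMemcpyHostToDevice);
    // Same triple-chevron launch syntax as CUDA; hipcc maps it to the
    // underlying runtime (ROCm or CUDA) at compile time.
    vadd<<<(n + 255) / 256, 256>>>(da, db, dc, n);
    hipMemcpy(hc.data(), dc, n * sizeof(float), hipMemcpyDeviceToHost);
    printf("c[0] = %f\n", hc[0]);  // expect 3.0
    hipFree(da); hipFree(db); hipFree(dc);
    return 0;
}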
Martins, Eduardo; Hoffmann, Renato; Alf, Lucas; Griebler, Dalvan: Interface para Programação de Pipelines Lineares Tolerantes a Falha para MPI Padrão C++. Inproceedings. In: Anais do XXVI Simpósio em Sistemas Computacionais de Alto Desempenho, pp. 133-144, SBC, Bonito, Brazil, 2025.
@inproceedings{MARTINS:SSCAD:25,
title = {Interface para Programação de Pipelines Lineares Tolerantes a Falha para MPI Padrão C++},
author = {Eduardo Martins and Renato Hoffmann and Lucas Alf and Dalvan Griebler},
url = {https://doi.org/10.5753/sscad.2025.15867},
doi = {10.5753/sscad.2025.15867},
year = {2025},
date = {2025-10-01},
booktitle = {Anais do XXVI Simpósio em Sistemas Computacionais de Alto Desempenho},
pages = {133-144},
publisher = {SBC},
address = {Bonito, Brazil},
series = {SSCAD'25},
abstract = {Stream processing systems are designed to operate continuously and must be able to recover from failures. However, programming high-performance applications for distributed environments introduces high development complexity. This work presents a programming interface that simplifies the construction of fault-tolerant linear pipelines for stream processing applications in C++. The solution uses MPI (Message Passing Interface) for communication and the ABS (Asynchronous Barrier Snapshotting) protocol, together with a monitor agent, for the recovery step. Experimental results indicate a significant reduction in the programmer's estimated development time, with an average impact on application throughput ranging from -0.98% to 6.73%. Moreover, the recovery process mitigates the impact of failures on program throughput.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
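To make the pipeline structure concrete, the sketch below wires three stages (source, transform, sink) to three MPI ranks with plain point-to-point messages. It is an illustrative example, not the paper's interface, and deliberately omits the fault-tolerance layer (ABS snapshots and the monitor agent) that the paper contributes.

// Illustrative sketch only: a bare three-stage linear pipeline over MPI ranks,
// without fault tolerance. Run with: mpirun -np 3 ./pipeline
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);  // expects exactly 3 ranks
    const int kItems = 10, kStop = -1;     // kStop marks end of stream

    if (rank == 0) {                       // stage 1: source emits items
        for (int i = 0; i < kItems; ++i)
            MPI_Send(&i, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        MPI_Send(&kStop, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {                // stage 2: transform each item
        int v;
        do {
            MPI_Recv(&v, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            int out = (v == kStop) ? kStop : v * v;  // example: square it
            MPI_Send(&out, 1, MPI_INT, 2, 0, MPI_COMM_WORLD);
        } while (v != kStop);
    } else if (rank == 2) {                // stage 3: sink consumes results
        int v;
        while (true) {
            MPI_Recv(&v, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            if (v == kStop) break;
            printf("result: %d\n", v);
        }
    }
    MPI_Finalize();
    return 0;
}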