Witam,
Ostatnio pisząc kod w assemblerze, który implementował filtr do zdjęć napotkałem problem. Próbując zloopować kod, tak, żeby bez wychodzenia z konsoli móc ponownie wypróbować filtr, wyrzuca mi wyjątek na wczytywaniu pikseli. Czy ma ktoś pomysł, jaka może być tego przyczyna?
Poniżej wklejam kawałki kodu. Z góry dziękuję za pomoc.
#include <iostream>
#include <fstream>
#include <thread>
#include <omp.h>
#include <chrono>
#include <mutex>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
using namespace cv;
extern "C" void alg(unsigned char* ptr, unsigned char* end_row, int width, int height, int start_row);
void algC(unsigned char* imdata, int width, int height, int channels, int stride) {
unsigned char* output = new unsigned char[(height + 2) * stride];
int kernel[3][3] = {
{ -1, 1, 1 },
{ -1, -2, 1 },
{ -1, 1, 1 }
};
for (int y = 1; y < height - 1; y++) {
for (int x = 1; x < width - 1; x++) {
for (int c = 0; c < channels; c++) {
int gradient = 0;
for (int ky = -1; ky <= 1; ky++) {
for (int kx = -1; kx <= 1; kx++) {
int pixelValue = imdata[(y + ky) * stride + (x + kx) * channels + c];
gradient += kernel[ky + 1][kx + 1] * pixelValue;
}
}
int result = gradient;
if (result < 0) result = 0;
else if (result > 255) result = 255;
output[y * stride + x * channels + c] = static_cast<unsigned char>(result);
}
}
}
for (int i = 0; i < height * stride; ++i) {
imdata[i] = output[i];
}
delete[] output;
}
// Function to process image sections using the C-based algorithm
void process_image_section_c(unsigned char* imdata, int start_row, int end_row, int width, int channels, int stride) {
unsigned char* section_imdata = imdata + start_row * stride;
algC(section_imdata, width, end_row - start_row, channels, stride);
}
// Function to process image sections using the Assembly-based algorithm
void process_image_section_asm(unsigned char* imdata, int start_row, int end_row, int width, int channels, int stride) {
unsigned char* section_imdata = imdata + start_row * stride;
int height = end_row - start_row;
unsigned char* output = new unsigned char[height * stride];
alg(section_imdata, output, width, height, stride);
memcpy(section_imdata, output, height * stride);
delete[] output;
}
int main(int argc, char* argv[]) {
const char* fileName = "C:/Users/machm/OneDrive - Politechnika Śląska/Pictures/lena.jpg";
Mat image = imread(fileName, IMREAD_COLOR);
if (image.empty()) {
std::cerr << "Error: Could not load image." << std::endl;
return -1;
}
cv::namedWindow("Before Processing", WINDOW_NORMAL);
cv::imshow("Before Processing", image);
waitKey(0);
int width = image.cols;
int height = image.rows;
int channels = image.channels();
int stride = image.step;
while (true) {
Mat imcopy = image.clone();
unsigned char* imcopydata = imcopy.data;
std::cout << "Choose algorithm to process the image:\n";
std::cout << "1. C++ Algorithm\n";
std::cout << "2. Assembly Algorithm\n";
std::cout << "Enter your choice (1 or 2): ";
int choice;
std::cin >> choice;
if (choice != 1 && choice != 2) {
std::cerr << "Invalid choice! Please select 1 or 2." << std::endl;
continue;
}
int iterations = 1;
double totalTime = 0.0;
for (int i = 0; i < iterations; i++) {
Mat iterCopy = image.clone();
unsigned char* iterCopyData = iterCopy.data;
int num_threads = 1; // Number of threads
std::vector<std::thread> threads;
int rows_per_thread = height / num_threads;
int remaining_rows = height % num_threads;
auto start = std::chrono::high_resolution_clock::now();
for (int t = 0; t < num_threads; t++) {
int start_row = t * rows_per_thread + std::min(t, remaining_rows);
int end_row = start_row + rows_per_thread + (t < remaining_rows ? 1 : 0);
if (choice == 1) {
threads.emplace_back(process_image_section_c, iterCopyData, start_row, end_row, width, channels, stride);
} else {
threads.emplace_back(process_image_section_asm, iterCopyData, start_row, end_row, width, channels, stride);
}
}
for (auto& thread : threads) {
thread.join();
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = end - start;
totalTime += std::chrono::duration<double>(duration).count();
// Display processed image for the first iteration only
if (i == 0) {
cv::namedWindow("Processed", WINDOW_NORMAL);
cv::imshow("Processed", iterCopy);
waitKey(0);
}
}
double averageTime = totalTime / iterations;
std::cout << "Total Time for " << iterations << " iterations: " << totalTime << "s\n";
std::cout << "Average Time per iteration: " << averageTime << "s\n";
char again;
std::cout << "Do you want to run another test? (y/n): ";
std::cin >> again;
if (again == 'n' || again == 'N') {
break;
}
}
return 0;
}
.DATA
zero_float dd 0.0, 0.0, 0.0, 0.0 ; 4x float 0.0
max_float dd 255.0, 255.0, 255.0, 255.0 ; 4x float 255.0
minus_one dd -1.0, -1.0, -1.0, -1.0
minus_two dd -2.0, -2.0, -2.0, -2.0
;===================PLAN=====================
;przetwarzac jeden piksel na raz
;1 kanal = 4 bajty (?)
;maske zrobic tak jak clampy
;dodawac do jakiegos innego rejestru
;wsadzac do outputu
;mozna na slowach
;lepiej nie na floatach
.CODE
alg PROC
push rbp
mov rbp, rsp
push rbx
push rsi
push rdi
push r12
push r13
push r14
push r15
; data
mov r15, rcx ; r15 = imdata (wskaźnik na dane obrazu)
mov r11, r8 ; r11 = width
mov r12, r9 ; r12 = height
mov r13, [rsp+104] ; r13 = stride (bajty na wiersz)
mov r14, rdx ; r14 = output
sub r12, 1 ; wysokosc -1
sub r11, 1
mov rbx, 1 ; zaczynamy od pierwszego wiersza
; dane do mnozenia/clampowania
movaps xmm0, [zero_float]
movaps xmm1, [max_float]
movaps xmm2, [minus_one]
movaps xmm3, [minus_two]
row_loop:
cmp rbx, r12
je end_loop ; jesli wiersz==wysokosc -> koniec
mov rdi, 1 ; zaczynamy od pierwszej kolumny
col_loop:
cmp rdi, r11
je next_row ; jesli kolumna==szerokosc -> nastepny wiersz
mov rax, rbx ; rax = y
imul rax, r13 ; rax = y * stride
mov rsi, rdi ; rsi = x
imul rsi, 3 ; rsi = x * channels
add rax, rsi ; rax = y * stride + x * channels
mov rcx, rax ; zapisz wskaznik na piksel w rcx
lea rax, [r15 + rax] ; wskaznik na obecny piksel w rax
xorps xmm13, xmm13 ; rejestr do przetrzymywania sumy kanalow
movd xmm4, dword ptr[rax] ; centralny piksel w xmm4
pxor xmm5, xmm5 ; zerowanie xmm5
punpcklbw xmm4, xmm5 ; bajty do slow
punpcklwd xmm4, xmm5 ; bajty do dslow
cvtdq2ps xmm4, xmm4 ; konwersja do float
mulps xmm4, [minus_two] ; mnozenie przez -2 kazdego kanalu
addps xmm13, xmm4 ; dodanie do xmm13 (tutaj bedzie suma wszystkich 3 kanalow)
sub rax, r13 ; wiersz do gory
sub rax, 3 ; piksel w lewo
movd xmm4, dword ptr[rax] ; lewy gorny piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
mulps xmm4, [minus_one] ; mnozenie przez -1 kazdego kanalu
addps xmm13, xmm4 ; dodanie do xmm13
add rax, 3 ; piksel w prawo
movd xmm4, dword ptr[rax] ; gorny piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
;mulps xmm4, [minus_one] ; mnozenie przez -1 kazdego kanalu
addps xmm13, xmm4 ; dodanie do xmm13
add rax, 3 ; piksel w prawo
movd xmm4, dword ptr[rax] ; prawy gorny piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
addps xmm13, xmm4 ; dodanie do xmm13 (mnozenie przez 1)
add rax, r13 ; wiersz w dol
movd xmm4, dword ptr[rax] ; prawy piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
addps xmm13, xmm4 ; dodanie do xmm13 (mnozenie przez 1)
sub rax, 6 ; dwa piksele w lewo
movd xmm4, dword ptr[rax] ; lewy piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
mulps xmm4, [minus_one] ; mnozenie przez -1 kazdego kanalu
addps xmm13, xmm4 ; dodanie do xmm13
add rax, r13 ; wiersz w dol
movd xmm4, dword ptr[rax] ; lewy dolny piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
mulps xmm4, [minus_one]
addps xmm13, xmm4 ; dodanie do xmm13 (mnozenie przez 1)
add rax, 3 ; piksel w prawo
movd xmm4, dword ptr[rax] ; dolny piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
addps xmm13, xmm4 ; dodanie do xmm13 (mnozenie przez 1)
add rax, 3 ; piksel w prawo
movd xmm4, dword ptr[rax] ; prawy dolny piksel w xmm4
pxor xmm5, xmm5
punpcklbw xmm4, xmm5
punpcklwd xmm4, xmm5
cvtdq2ps xmm4, xmm4
addps xmm13, xmm4 ; dodanie do xmm13 (mnozenie przez 1)
cvttps2dq xmm7, xmm13 ; float to int
packusdw xmm7, xmm7 ; zapakowanie do slow
packuswb xmm7, xmm7 ; zapakowanie do bajtow
movd edx, xmm7 ; zapisanie do edx
lea rax, [r14 + rcx] ; wskaznik na piksel w buforze wyjsciowym
mov dword ptr [rax], edx ; zapisanie nowego piksela w buforze wyjsciowym
inc rdi ; inkrementacja kolumny
jmp col_loop
next_row:
inc rbx ; inkrementacja wiersza
jmp row_loop
end_loop:
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbx
pop rbp
ret
alg ENDP
END
;-------------------------------------------------------------------------
LIBRARY JADll
EXPORTS alg
;-------------------------------------------------------------------------